In [ ]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import statsmodels.api as sample_data
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from matplotlib.ticker import PercentFormatter
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from itertools import cycle
from sklearn.metrics import auc, roc_curve
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import RocCurveDisplay
In [ ]:
from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
In [ ]:
#Import AirBnB listings data from Austin
df = pd.read_csv('/content/drive/MyDrive/airbnb_listings_austin.csv')
In [ ]:
#See data types in data
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5835 entries, 0 to 5834 Data columns (total 54 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 5835 non-null int64 1 listing_url 5835 non-null object 2 name 5835 non-null object 3 summary 5373 non-null object 4 space 4475 non-null object 5 description 5832 non-null object 6 experiences_offered 5835 non-null object 7 neighborhood_overview 3572 non-null object 8 notes 2412 non-null object 9 transit 3492 non-null object 10 host_id 5835 non-null int64 11 host_name 5820 non-null object 12 host_since 5820 non-null object 13 host_location 5810 non-null object 14 host_about 3974 non-null object 15 host_response_time 4177 non-null object 16 host_response_rate 4177 non-null object 17 host_is_superhost 5820 non-null object 18 host_listings_count 5820 non-null float64 19 host_has_profile_pic 5820 non-null object 20 host_identity_verified 5820 non-null object 21 neighbourhood 4800 non-null object 22 city 5835 non-null object 23 property_type 5835 non-null object 24 room_type 5835 non-null object 25 accommodates 5835 non-null int64 26 bathrooms 5789 non-null float64 27 bedrooms 5829 non-null float64 28 beds 5812 non-null float64 29 bed_type 5835 non-null object 30 amenities 5835 non-null object 31 square_feet 302 non-null float64 32 price 5835 non-null object 33 weekly_price 2227 non-null object 34 security_deposit 2770 non-null object 35 cleaning_fee 3587 non-null object 36 guests_included 5835 non-null int64 37 extra_people 5835 non-null object 38 minimum_nights 5835 non-null int64 39 has_availability 5835 non-null object 40 availability_30 5835 non-null int64 41 availability_60 5835 non-null int64 42 availability_90 5835 non-null int64 43 availability_365 5835 non-null int64 44 number_of_reviews 5835 non-null int64 45 review_scores_rating 3789 non-null float64 46 review_scores_accuracy 3776 non-null float64 47 review_scores_cleanliness 3778 non-null float64 48 review_scores_checkin 3778 non-null float64 49 review_scores_communication 3778 non-null float64 50 review_scores_location 3779 non-null float64 51 review_scores_value 3778 non-null float64 52 instant_bookable 5835 non-null object 53 cancellation_policy 5835 non-null object dtypes: float64(12), int64(10), object(32) memory usage: 2.4+ MB
In [ ]:
#Get number of nulls in each column
df.isnull().sum()
Out[ ]:
id 0 listing_url 0 name 0 summary 462 space 1360 description 3 experiences_offered 0 neighborhood_overview 2263 notes 3423 transit 2343 host_id 0 host_name 15 host_since 15 host_location 25 host_about 1861 host_response_time 1658 host_response_rate 1658 host_is_superhost 15 host_listings_count 15 host_has_profile_pic 15 host_identity_verified 15 neighbourhood 1035 city 0 property_type 0 room_type 0 accommodates 0 bathrooms 46 bedrooms 6 beds 23 bed_type 0 amenities 0 square_feet 5533 price 0 weekly_price 3608 security_deposit 3065 cleaning_fee 2248 guests_included 0 extra_people 0 minimum_nights 0 has_availability 0 availability_30 0 availability_60 0 availability_90 0 availability_365 0 number_of_reviews 0 review_scores_rating 2046 review_scores_accuracy 2059 review_scores_cleanliness 2057 review_scores_checkin 2057 review_scores_communication 2057 review_scores_location 2056 review_scores_value 2057 instant_bookable 0 cancellation_policy 0 dtype: int64
In [ ]:
#Get number of unqiue values or types in each column
#This helps with deciding what to make dummies of for non-numerical data
df.nunique()
Out[ ]:
id 5835 listing_url 5835 name 5784 summary 5261 space 4421 description 5791 experiences_offered 1 neighborhood_overview 3379 notes 2155 transit 3306 host_id 4633 host_name 1888 host_since 1578 host_location 171 host_about 2946 host_response_time 4 host_response_rate 50 host_is_superhost 2 host_listings_count 24 host_has_profile_pic 2 host_identity_verified 2 neighbourhood 79 city 12 property_type 18 room_type 3 accommodates 16 bathrooms 16 bedrooms 10 beds 16 bed_type 5 amenities 4474 square_feet 108 price 468 weekly_price 475 security_deposit 76 cleaning_fee 100 guests_included 16 extra_people 59 minimum_nights 26 has_availability 1 availability_30 31 availability_60 61 availability_90 91 availability_365 362 number_of_reviews 163 review_scores_rating 41 review_scores_accuracy 8 review_scores_cleanliness 9 review_scores_checkin 7 review_scores_communication 7 review_scores_location 7 review_scores_value 8 instant_bookable 2 cancellation_policy 5 dtype: int64
In [ ]:
#Brief view of data
df
Out[ ]:
| id | listing_url | name | summary | space | description | experiences_offered | neighborhood_overview | notes | transit | ... | number_of_reviews | review_scores_rating | review_scores_accuracy | review_scores_cleanliness | review_scores_checkin | review_scores_communication | review_scores_location | review_scores_value | instant_bookable | cancellation_policy | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 72635 | https://www.airbnb.com/rooms/72635 | 3 Private Bedrooms, SW Austin | Conveniently located 10-15 from downtown in SW... | We have three spare bedrooms, each with a quee... | Conveniently located 10-15 from downtown in SW... | none | Location and convenience are key. Easy access... | NaN | Unfortunately there is no convenient public tr... | ... | 1 | 100.0 | 10.0 | 10.0 | 10.0 | 10.0 | 10.0 | 10.0 | f | moderate |
| 1 | 5386323 | https://www.airbnb.com/rooms/5386323 | Cricket Trailer | Rent this cool concept trailer that has everyt... | Rental arrangements for this trailer allows yo... | Rent this cool concept trailer that has everyt... | none | We're talking about wherever you'd like in the... | NaN | Bike, Bus, Metrorail, etc. you name it we've g... | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | f | moderate |
| 2 | 8826517 | https://www.airbnb.com/rooms/8826517 | Private room 1 in South Austin | Upstairs, private, 12ft x 13 1/2ft room. Priv... | NaN | Upstairs, private, 12ft x 13 1/2ft room. Priv... | none | NaN | NaN | NaN | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | f | flexible |
| 3 | 8828616 | https://www.airbnb.com/rooms/8828616 | Private room 2 in South Austin | Upstairs, private, 11ft x 13 1/2ft room. Priv... | NaN | Upstairs, private, 11ft x 13 1/2ft room. Priv... | none | NaN | NaN | NaN | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | f | flexible |
| 4 | 8536913 | https://www.airbnb.com/rooms/8536913 | Brand-New 3BR Austin Home | Brand-new 3BR/2BA Austin home with landscaped ... | Feel instantly at home at our brand new 3BR/2B... | Brand-new 3BR/2BA Austin home with landscaped ... | none | Entertainment and activities are plentiful her... | NaN | NaN | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | f | strict |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5830 | 6063670 | https://www.airbnb.com/rooms/6063670 | Austin's Downtown Garden Suite | Enjoy being literally steps from everything th... | If you are looking for the perfect suite in th... | Enjoy being literally steps from everything th... | none | I love that the downtown neighborhood is so vi... | If you are interested in hosting an even large... | In addition to the Airport Flyer that I alread... | ... | 9 | 100.0 | 10.0 | 10.0 | 10.0 | 10.0 | 10.0 | 9.0 | f | strict |
| 5831 | 8422925 | https://www.airbnb.com/rooms/8422925 | Two beds in Downtown Austin! | Prime location for the Austin Convention Cente... | Located in the heart of downtown, this room co... | Prime location for the Austin Convention Cente... | none | This truly is in the middle of everything goin... | NaN | Buses leave from across the street (including ... | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | f | moderate |
| 5832 | 3345881 | https://www.airbnb.com/rooms/3345881 | Casa Romántica en Picos de Europa | Axtur: Picos de Europa. Desfiladero del Sella ... | Una casa excepcional en un paisaje excepcional... | Una casa excepcional en un paisaje excepcional... | none | Pueblecito asturiano, con muy pocos vecinos, d... | Paisaje y tranquilidad. | En Coche | ... | 1 | 100.0 | 8.0 | 10.0 | 10.0 | 10.0 | 10.0 | 8.0 | t | strict |
| 5833 | 8954997 | https://www.airbnb.com/rooms/8954997 | Living room with bed | Living room with bed have bathroom. | NaN | Living room with bed have bathroom. | none | NaN | NaN | NaN | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | f | flexible |
| 5834 | 7618185 | https://www.airbnb.com/rooms/7618185 | Comfy 1 bedroom in North Austin | NaN | Cozy one bedroom/one bath 1st floor apartment ... | Cozy one bedroom/one bath 1st floor apartment ... | none | NaN | The security deposit may be forfeited in the e... | Close to grocery stores, restaurants and a mov... | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | f | strict |
5835 rows × 54 columns
In [ ]:
#Impute columns (vars) with a value like f for false
def impute(value, vars):
for var in vars:
df[var][df[var].isnull() == True] = value
print(f'{var} nulls:', df[var].isnull().sum())
In [ ]:
#Make dummies when a column consists of t for true and f for false
def make_binary_dummies(vars):
for var in vars:
df[var][df[var] == 'f'] = 0
df[var][df[var] == 't'] = 1
print(df[var].head(20))
In [ ]:
#Make categorical columns into numerical dummies where each category is replaced
#by a number starting from 0
#Used in Pandas apply function
def dummyize(x, columns):
for column in range(len(columns)):
if x == columns[column]:
return column
In [ ]:
#Impute all nulls with false
impute('f', ['host_is_superhost', 'host_has_profile_pic', 'host_identity_verified', 'instant_bookable'])
host_is_superhost nulls: 0 host_has_profile_pic nulls: 0 host_identity_verified nulls: 0 instant_bookable nulls: 0
<ipython-input-885-5be6f4368e2b>:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df[var][df[var].isnull() == True] = value
In [ ]:
#Impute cleaning_fee and security_deposit with $0
impute('$0', ['cleaning_fee', 'security_deposit'])
cleaning_fee nulls: 0 security_deposit nulls: 0
<ipython-input-885-5be6f4368e2b>:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df[var][df[var].isnull() == True] = value
In [ ]:
#Impute bathrooms, bedrooms, and beds with 1 of each
impute(1, ['bathrooms', 'bedrooms', 'beds'])
bathrooms nulls: 0 bedrooms nulls: 0 beds nulls: 0
<ipython-input-885-5be6f4368e2b>:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df[var][df[var].isnull() == True] = value
In [ ]:
#Impute missing host since with 12/6/2023
impute('12/6/2023', ['host_since'])
host_since nulls: 0
<ipython-input-885-5be6f4368e2b>:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df[var][df[var].isnull() == True] = value
In [ ]:
#Convert price columns into floats
for var in ['price', 'weekly_price', 'security_deposit', 'extra_people', 'cleaning_fee']:
df[var] = df[var].replace('\$', '', regex=True).replace(',', '', regex=True).astype(float)
print(df[var].head())
0 300.0 1 99.0 2 100.0 3 100.0 4 599.0 Name: price, dtype: float64 0 NaN 1 600.0 2 NaN 3 NaN 4 NaN Name: weekly_price, dtype: float64 0 0.0 1 1000.0 2 0.0 3 0.0 4 0.0 Name: security_deposit, dtype: float64 0 0.0 1 0.0 2 0.0 3 0.0 4 0.0 Name: extra_people, dtype: float64 0 0.0 1 75.0 2 0.0 3 0.0 4 125.0 Name: cleaning_fee, dtype: float64
In [ ]:
#Store current weekly price values for checking if imputing was done correctly
#where the null were
df['old_weekly_price'] = df['weekly_price']
In [ ]:
#Impute weekly_price with the price times 7
df['weekly_price'][df['weekly_price'].isnull() == True] = df['price'] * 7
print('weekly_price nulls:', df[var].isnull().sum())
weekly_price nulls: 0
<ipython-input-894-41f3e5500f06>:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df['weekly_price'][df['weekly_price'].isnull() == True] = df['price'] * 7
In [ ]:
#Check that the imputation was done correctly
df[['price', 'weekly_price']][df['old_weekly_price'].isnull() == True].head(60)
Out[ ]:
| price | weekly_price | |
|---|---|---|
| 0 | 300.0 | 2100.0 |
| 2 | 100.0 | 700.0 |
| 3 | 100.0 | 700.0 |
| 4 | 599.0 | 4193.0 |
| 5 | 100.0 | 700.0 |
| 6 | 54.0 | 378.0 |
| 7 | 40.0 | 280.0 |
| 11 | 50.0 | 350.0 |
| 14 | 55.0 | 385.0 |
| 18 | 59.0 | 413.0 |
| 20 | 40.0 | 280.0 |
| 21 | 150.0 | 1050.0 |
| 22 | 175.0 | 1225.0 |
| 24 | 120.0 | 840.0 |
| 25 | 450.0 | 3150.0 |
| 26 | 49.0 | 343.0 |
| 29 | 50.0 | 350.0 |
| 33 | 200.0 | 1400.0 |
| 36 | 155.0 | 1085.0 |
| 37 | 700.0 | 4900.0 |
| 38 | 819.0 | 5733.0 |
| 39 | 1200.0 | 8400.0 |
| 40 | 1250.0 | 8750.0 |
| 45 | 600.0 | 4200.0 |
| 46 | 200.0 | 1400.0 |
| 49 | 629.0 | 4403.0 |
| 51 | 469.0 | 3283.0 |
| 52 | 1500.0 | 10500.0 |
| 54 | 250.0 | 1750.0 |
| 55 | 499.0 | 3493.0 |
| 62 | 165.0 | 1155.0 |
| 63 | 115.0 | 805.0 |
| 64 | 909.0 | 6363.0 |
| 69 | 350.0 | 2450.0 |
| 70 | 130.0 | 910.0 |
| 73 | 600.0 | 4200.0 |
| 75 | 65.0 | 455.0 |
| 78 | 450.0 | 3150.0 |
| 79 | 6500.0 | 45500.0 |
| 80 | 249.0 | 1743.0 |
| 84 | 300.0 | 2100.0 |
| 87 | 88.0 | 616.0 |
| 90 | 200.0 | 1400.0 |
| 91 | 150.0 | 1050.0 |
| 93 | 625.0 | 4375.0 |
| 95 | 250.0 | 1750.0 |
| 97 | 119.0 | 833.0 |
| 99 | 199.0 | 1393.0 |
| 100 | 97.0 | 679.0 |
| 103 | 118.0 | 826.0 |
| 104 | 163.0 | 1141.0 |
| 105 | 400.0 | 2800.0 |
| 106 | 125.0 | 875.0 |
| 107 | 169.0 | 1183.0 |
| 109 | 300.0 | 2100.0 |
| 110 | 75.0 | 525.0 |
| 111 | 60.0 | 420.0 |
| 112 | 500.0 | 3500.0 |
| 113 | 157.0 | 1099.0 |
| 116 | 399.0 | 2793.0 |
In [ ]:
df['host_location'].head(60)
Out[ ]:
0 Austin, Texas, United States 1 Austin, Texas, United States 2 Austin, Texas, United States 3 Austin, Texas, United States 4 US 5 Austin, Texas, United States 6 Austin, Texas, United States 7 Austin, Texas, United States 8 US 9 Austin, Texas, United States 10 US 11 Austin, Texas, United States 12 Austin, Texas, United States 13 Austin, Texas, United States 14 Austin, Texas, United States 15 Austin, Texas, United States 16 Austin, Texas, United States 17 Austin, Texas, United States 18 Austin, Texas, United States 19 Austin, Texas, United States 20 Austin, Texas, United States 21 Austin, Texas, United States 22 Austin, Texas, United States 23 Austin, Texas, United States 24 US 25 US 26 Austin, Texas, United States 27 Austin, Texas, United States 28 Austin, Texas, United States 29 Austin, Texas, United States 30 Austin, Texas, United States 31 Austin, Texas, United States 32 US 33 Austin, Texas, United States 34 Austin, Texas, United States 35 Austin, Texas, United States 36 Austin, Texas, United States 37 US 38 US 39 Austin, Texas, United States 40 Austin, Texas, United States 41 US 42 US 43 US 44 Austin, Texas, United States 45 US 46 Austin, Texas, United States 47 New York, New York, United States 48 Austin, Texas, United States 49 US 50 Austin, Texas, United States 51 US 52 US 53 US 54 US 55 US 56 Austin, Texas, United States 57 Austin, Texas, United States 58 Austin, Texas, United States 59 Austin, Texas, United States Name: host_location, dtype: object
In [ ]:
#Make host_location 1 for if a host is from Austin and 0 if the host is not
#List of cities that are or the suburbs of Austin in the data
Austin_city_list = ['Austin, Texas, United States', 'Austin, Texas', 'Austin',\
'Dripping Springs, Texas, United States', 'Sunset Valley, Texas, United States', \
'West Lake Hills, Texas, United States', 'Round Rock, Texas, United States', \
'Pflugerville, Texas, United States']
df['host_location'][df['host_location'].isin(Austin_city_list)] = 1
df['host_location'][df['host_location'] != 1] = 0
df['host_location'].head(60)
<ipython-input-897-77b4485cd734>:8: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df['host_location'][df['host_location'].isin(Austin_city_list)] = 1 <ipython-input-897-77b4485cd734>:9: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df['host_location'][df['host_location'] != 1] = 0
Out[ ]:
0 1 1 1 2 1 3 1 4 0 5 1 6 1 7 1 8 0 9 1 10 0 11 1 12 1 13 1 14 1 15 1 16 1 17 1 18 1 19 1 20 1 21 1 22 1 23 1 24 0 25 0 26 1 27 1 28 1 29 1 30 1 31 1 32 0 33 1 34 1 35 1 36 1 37 0 38 0 39 1 40 1 41 0 42 0 43 0 44 1 45 0 46 1 47 0 48 1 49 0 50 1 51 0 52 0 53 0 54 0 55 0 56 1 57 1 58 1 59 1 Name: host_location, dtype: object
In [ ]:
#Drop listing from Spain
df['city'].drop(df['city'].str.contains('Cangas de Onís').index, inplace=True)
In [ ]:
#Drop columns that will not be used in making models
#Drop temporary old_weekly_price column
df.drop(['old_weekly_price', 'experiences_offered', 'has_availability', 'id', \
'listing_url', 'name', 'summary', 'city', 'description', 'notes', \
'neighborhood_overview', 'host_about', 'amenities', 'transit', 'host_id', \
'host_name', 'host_about', 'neighbourhood', 'square_feet', 'space'], axis = 1, inplace=True)
In [ ]:
df.dropna(subset=['review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', \
'review_scores_checkin', 'review_scores_communication', 'review_scores_location', \
'review_scores_value', 'host_response_time', 'host_response_rate'], inplace=True)
In [ ]:
make_binary_dummies(['host_is_superhost', 'host_has_profile_pic', 'host_identity_verified', 'instant_bookable'])
7 0 9 1 11 0 12 1 13 1 14 1 16 0 19 1 23 0 26 1 27 0 28 0 29 0 34 1 35 0 37 0 39 0 40 0 44 1 46 0 Name: host_is_superhost, dtype: object 7 1 9 1 11 1 12 1 13 1 14 1 16 1 19 1 23 1 26 1 27 1 28 1 29 1 34 1 35 1 37 1 39 1 40 1 44 1 46 1 Name: host_has_profile_pic, dtype: object 7 1 9 1 11 1 12 1 13 1 14 0 16 1 19 1 23 1 26 1 27 1 28 1 29 1 34 1 35 1 37 1 39 1 40 1 44 1 46 1 Name: host_identity_verified, dtype: object 7 1 9 0 11 0 12 0 13 0 14 0 16 0 19 0 23 0 26 0 27 0 28 0 29 0 34 0 35 0 37 0 39 0 40 0 44 1 46 0 Name: instant_bookable, dtype: object
<ipython-input-886-78d3eb0f1892>:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df[var][df[var] == 'f'] = 0 <ipython-input-886-78d3eb0f1892>:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df[var][df[var] == 't'] = 1 <ipython-input-886-78d3eb0f1892>:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df[var][df[var] == 'f'] = 0 <ipython-input-886-78d3eb0f1892>:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df[var][df[var] == 't'] = 1 <ipython-input-886-78d3eb0f1892>:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df[var][df[var] == 'f'] = 0 <ipython-input-886-78d3eb0f1892>:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df[var][df[var] == 't'] = 1 <ipython-input-886-78d3eb0f1892>:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df[var][df[var] == 'f'] = 0 <ipython-input-886-78d3eb0f1892>:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df[var][df[var] == 't'] = 1
In [ ]:
#Get unique room types
df["room_type"].unique()
Out[ ]:
array(['Private room', 'Entire home/apt', 'Shared room'], dtype=object)
In [ ]:
#Get unique room types
df["cancellation_policy"].unique()
Out[ ]:
array(['moderate', 'strict', 'flexible', 'super_strict_30', 'no_refunds'],
dtype=object)
In [ ]:
#Convert categorical columns into dummies
df["property_type"] = df["property_type"].apply(lambda x: dummyize(x, pd.get_dummies(df['property_type']).columns))
df["room_type"] = df["room_type"].apply(lambda x: dummyize(x, pd.get_dummies(df['room_type']).columns))
df["bed_type"] = df["bed_type"].apply(lambda x: dummyize(x, pd.get_dummies(df['bed_type']).columns))
df["cancellation_policy"] = df["cancellation_policy"].apply(lambda x: dummyize(x, pd.get_dummies(df['cancellation_policy']).columns))
df["host_response_time"] = df["host_response_time"].apply(lambda x: dummyize(x, pd.get_dummies(df['host_response_time']).columns))
In [ ]:
#See dummies of those room types
df["room_type"].unique()
Out[ ]:
array([1, 0, 2])
In [ ]:
#Get unique room types
df["cancellation_policy"].unique()
Out[ ]:
array([1, 3, 0, 4, 2])
In [ ]:
#Get rid of the percentage sign and covert the response rate into floats
df['host_response_rate'] = df['host_response_rate'].str.replace('%', '').astype('float')
In [ ]:
#Convert the categorical variables to type category so they can be used in
#logistic regression models without issue
for column in ['host_location', 'host_is_superhost', 'host_has_profile_pic', \
'host_identity_verified', 'instant_bookable']:
df[column] = df[column].astype('category')
In [ ]:
#Convert host_since to datetime type
df['host_since'] = pd.to_datetime(df['host_since'])
#Get the year from host_since and convert it to a int since datetime
#can't be used in sci-kit learn models.
#Only have a column for the year as a float since the day or month probably
#won't be useful as their own columns
#Convert to float first because converting directly to int causes all the years
#to become 1970
df['host_since'] = df['host_since'].dt.year.astype('float').astype('int')
In [ ]:
#Check resulting dataframe
df
Out[ ]:
| host_since | host_location | host_response_time | host_response_rate | host_is_superhost | host_listings_count | host_has_profile_pic | host_identity_verified | property_type | room_type | ... | number_of_reviews | review_scores_rating | review_scores_accuracy | review_scores_cleanliness | review_scores_checkin | review_scores_communication | review_scores_location | review_scores_value | instant_bookable | cancellation_policy | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 7 | 2014 | 1 | 3 | 100.0 | 0 | 1.0 | 1 | 1 | 9 | 1 | ... | 2 | 100.0 | 10.0 | 10.0 | 10.0 | 10.0 | 10.0 | 10.0 | 1 | 1 |
| 9 | 2012 | 1 | 3 | 100.0 | 1 | 1.0 | 1 | 1 | 9 | 1 | ... | 20 | 99.0 | 10.0 | 10.0 | 10.0 | 10.0 | 10.0 | 10.0 | 0 | 1 |
| 11 | 2011 | 1 | 3 | 100.0 | 0 | 1.0 | 1 | 1 | 9 | 1 | ... | 9 | 93.0 | 10.0 | 10.0 | 10.0 | 10.0 | 10.0 | 10.0 | 0 | 3 |
| 12 | 2013 | 1 | 3 | 97.0 | 1 | 9.0 | 1 | 1 | 9 | 1 | ... | 4 | 100.0 | 9.0 | 10.0 | 10.0 | 10.0 | 10.0 | 10.0 | 0 | 3 |
| 13 | 2013 | 1 | 3 | 97.0 | 1 | 9.0 | 1 | 1 | 9 | 1 | ... | 7 | 100.0 | 10.0 | 10.0 | 10.0 | 10.0 | 10.0 | 10.0 | 0 | 3 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5826 | 2013 | 1 | 2 | 100.0 | 0 | 339.0 | 1 | 1 | 0 | 0 | ... | 12 | 85.0 | 9.0 | 9.0 | 9.0 | 9.0 | 10.0 | 8.0 | 0 | 3 |
| 5827 | 2011 | 1 | 2 | 87.0 | 0 | 19.0 | 1 | 1 | 0 | 0 | ... | 13 | 94.0 | 9.0 | 9.0 | 10.0 | 9.0 | 10.0 | 9.0 | 0 | 3 |
| 5829 | 2012 | 1 | 2 | 95.0 | 0 | 11.0 | 1 | 0 | 0 | 0 | ... | 13 | 88.0 | 10.0 | 10.0 | 10.0 | 10.0 | 10.0 | 10.0 | 0 | 1 |
| 5830 | 2013 | 1 | 2 | 100.0 | 1 | 21.0 | 1 | 1 | 0 | 0 | ... | 9 | 100.0 | 10.0 | 10.0 | 10.0 | 10.0 | 10.0 | 9.0 | 0 | 3 |
| 5832 | 2014 | 0 | 3 | 100.0 | 0 | 5.0 | 1 | 0 | 9 | 0 | ... | 1 | 100.0 | 8.0 | 10.0 | 10.0 | 10.0 | 10.0 | 8.0 | 1 | 3 |
3372 rows × 36 columns
In [ ]:
#Get number of nulls in each column
df.isnull().sum()
Out[ ]:
host_since 0 host_location 0 host_response_time 0 host_response_rate 0 host_is_superhost 0 host_listings_count 0 host_has_profile_pic 0 host_identity_verified 0 property_type 0 room_type 0 accommodates 0 bathrooms 0 bedrooms 0 beds 0 bed_type 0 price 0 weekly_price 0 security_deposit 0 cleaning_fee 0 guests_included 0 extra_people 0 minimum_nights 0 availability_30 0 availability_60 0 availability_90 0 availability_365 0 number_of_reviews 0 review_scores_rating 0 review_scores_accuracy 0 review_scores_cleanliness 0 review_scores_checkin 0 review_scores_communication 0 review_scores_location 0 review_scores_value 0 instant_bookable 0 cancellation_policy 0 dtype: int64
In [ ]:
#See data types in data
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 3372 entries, 7 to 5832 Data columns (total 36 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 host_since 3372 non-null int64 1 host_location 3372 non-null category 2 host_response_time 3372 non-null int64 3 host_response_rate 3372 non-null float64 4 host_is_superhost 3372 non-null category 5 host_listings_count 3372 non-null float64 6 host_has_profile_pic 3372 non-null category 7 host_identity_verified 3372 non-null category 8 property_type 3372 non-null int64 9 room_type 3372 non-null int64 10 accommodates 3372 non-null int64 11 bathrooms 3372 non-null float64 12 bedrooms 3372 non-null float64 13 beds 3372 non-null float64 14 bed_type 3372 non-null int64 15 price 3372 non-null float64 16 weekly_price 3372 non-null float64 17 security_deposit 3372 non-null float64 18 cleaning_fee 3372 non-null float64 19 guests_included 3372 non-null int64 20 extra_people 3372 non-null float64 21 minimum_nights 3372 non-null int64 22 availability_30 3372 non-null int64 23 availability_60 3372 non-null int64 24 availability_90 3372 non-null int64 25 availability_365 3372 non-null int64 26 number_of_reviews 3372 non-null int64 27 review_scores_rating 3372 non-null float64 28 review_scores_accuracy 3372 non-null float64 29 review_scores_cleanliness 3372 non-null float64 30 review_scores_checkin 3372 non-null float64 31 review_scores_communication 3372 non-null float64 32 review_scores_location 3372 non-null float64 33 review_scores_value 3372 non-null float64 34 instant_bookable 3372 non-null category 35 cancellation_policy 3372 non-null int64 dtypes: category(5), float64(17), int64(14) memory usage: 860.1 KB
Problem 1¶
In [ ]:
Problem 2¶
Preprocessing for problem¶
In [ ]:
#Drop availability variables except availability_90
df_prob_2 = df.drop(['availability_30', 'availability_60', 'availability_365'], axis = 1)
In [ ]:
#Make availability_90 into a percentage
print(df_prob_2['availability_90'])
df_prob_2['availability_90'] = df_prob_2['availability_90'] / 90
print(df_prob_2['availability_90'])
7 37
9 89
11 84
12 85
13 89
..
5826 80
5827 76
5829 73
5830 65
5832 89
Name: availability_90, Length: 3372, dtype: int64
7 0.411111
9 0.988889
11 0.933333
12 0.944444
13 0.988889
...
5826 0.888889
5827 0.844444
5829 0.811111
5830 0.722222
5832 0.988889
Name: availability_90, Length: 3372, dtype: float64
In [ ]:
def is_booked(percentage):
if percentage < 0.4:
return 1
else:
return 0
df_prob_2["availability_90"] = df_prob_2["availability_90"].apply(lambda x: is_booked(x))
df_prob_2["availability_90"].head(60)
Out[ ]:
7 0 9 0 11 0 12 0 13 0 14 0 16 0 19 1 23 0 26 0 27 0 28 0 29 0 34 0 35 0 37 0 39 1 40 0 44 0 46 0 50 0 51 0 53 0 56 0 57 0 58 0 60 0 62 0 63 0 65 0 66 0 67 0 68 0 71 0 72 0 75 0 76 0 77 0 83 0 85 0 86 1 87 0 88 0 89 0 93 0 94 1 96 1 100 0 101 1 102 0 103 0 104 0 107 0 108 0 110 0 111 0 113 1 114 0 115 0 116 0 Name: availability_90, dtype: int64
In [ ]:
#Drop number of reviews since common sense tells us that popularity
#will be closely related to how booked an AirBnB is and the models
#will heavily train on it if it is left in for this problem
df_prob_2.drop('number_of_reviews', axis=1, inplace=True)
In [ ]:
#Make availability_90 of type category so it can be used in training models
df_prob_2['availability_90'] = df_prob_2['availability_90'].astype('category')
In [ ]:
df_prob_2_sampled = df_prob_2.groupby('availability_90').apply(lambda s: s.sample(500))
In [ ]:
#Split the data on what the model is learning to predict, whether an AirBnB will be booked
X = df_prob_2_sampled.drop('availability_90', axis=1)
y = df_prob_2_sampled['availability_90']
#Split the data into training and test sets to be able to train and compare models
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=77)
X_train.info()
y_train.info()
<class 'pandas.core.frame.DataFrame'> MultiIndex: 700 entries, (0, 57) to (1, 2090) Data columns (total 31 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 host_since 700 non-null int64 1 host_location 700 non-null category 2 host_response_time 700 non-null int64 3 host_response_rate 700 non-null float64 4 host_is_superhost 700 non-null category 5 host_listings_count 700 non-null float64 6 host_has_profile_pic 700 non-null category 7 host_identity_verified 700 non-null category 8 property_type 700 non-null int64 9 room_type 700 non-null int64 10 accommodates 700 non-null int64 11 bathrooms 700 non-null float64 12 bedrooms 700 non-null float64 13 beds 700 non-null float64 14 bed_type 700 non-null int64 15 price 700 non-null float64 16 weekly_price 700 non-null float64 17 security_deposit 700 non-null float64 18 cleaning_fee 700 non-null float64 19 guests_included 700 non-null int64 20 extra_people 700 non-null float64 21 minimum_nights 700 non-null int64 22 review_scores_rating 700 non-null float64 23 review_scores_accuracy 700 non-null float64 24 review_scores_cleanliness 700 non-null float64 25 review_scores_checkin 700 non-null float64 26 review_scores_communication 700 non-null float64 27 review_scores_location 700 non-null float64 28 review_scores_value 700 non-null float64 29 instant_bookable 700 non-null category 30 cancellation_policy 700 non-null int64 dtypes: category(5), float64(17), int64(9) memory usage: 188.6 KB <class 'pandas.core.series.Series'> MultiIndex: 700 entries, (0, 57) to (1, 2090) Series name: availability_90 Non-Null Count Dtype -------------- ----- 700 non-null category dtypes: category(1) memory usage: 43.2 KB
Create And Assess Decision Tree Classifiers¶
Default Tree¶
In [ ]:
# create an instance of a decision tree classifier using default values
dt_prob_2 = DecisionTreeClassifier(max_depth = 25, min_samples_leaf=10, ccp_alpha = 0.001)
# fit the model to the training data
dt_prob_2.fit(X_train, y_train)
Out[ ]:
DecisionTreeClassifier(ccp_alpha=0.001, max_depth=25, min_samples_leaf=10)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier(ccp_alpha=0.001, max_depth=25, min_samples_leaf=10)
In [ ]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
# visualize the decision tree
dot_data = export_graphviz(dt_prob_2, filled=True, rounded=True, feature_names=X.columns, class_names=['0', '1'])
graph = graphviz.Source(dot_data)
graph.render("decision_tree_default_prob_2")
Out[ ]:
'decision_tree_default_prob_2.pdf'
In [ ]:
#Show the visualization of the decision tree in this notebook
export_graphviz(dt_prob_2,
'tree.dot',
class_names=['0','1'],
feature_names = X_train.columns)
! dot -Tpng tree.dot -o tree.png
import matplotlib.pyplot as plt
import cv2
%matplotlib inline
img = cv2.imread('tree.png')
plt.figure(figsize = (20, 40))
plt.imshow(img)
Out[ ]:
<matplotlib.image.AxesImage at 0x7c57c37c3af0>
In [ ]:
# make predictions on the training and test data
y_pred_train = dt_prob_2.predict(X_train)
y_pred_test = dt_prob_2.predict(X_test)
y_prob_train = dt_prob_2.predict_proba(X_train)
y_prob_test = dt_prob_2.predict_proba(X_test)
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train)
prec_train = precision_score(y_train, y_pred_train)
rec_train = recall_score(y_train, y_pred_train)
# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print("Precision: {:.4f}".format(prec_train))
print("Recall. : {:.4f}".format(rec_train))
print("")
# calculate the accuracy, precision, and recall scores for the test set
acc_test = accuracy_score(y_test, y_pred_test)
prec_test = precision_score(y_test, y_pred_test)
rec_test = recall_score(y_test, y_pred_test)
# print the scores for the test set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print("Precision: {:.4f}".format(prec_test))
print("Recall. : {:.4f}".format(rec_test))
-- train set -- Accuracy : 0.7586 Precision: 0.7514 Recall. : 0.7749 -- test set -- Accuracy : 0.5433 Precision: 0.5357 Recall. : 0.6040
In [ ]:
# confusion matrix for training set
conf_matrix = confusion_matrix(y_pred_train, y_train)
print(conf_matrix)
# Generate confusion matrix for test set
conf_matrix = confusion_matrix(y_pred_test, y_test)
print(conf_matrix)
[[259 79] [ 90 272]] [[73 59] [78 90]]
In [ ]:
# Generate ROC curve for training data
fpr, tpr, thresholds = roc_curve(y_train.cat.codes, y_prob_train[:,1])
roc_auc = roc_auc_score(y_train.cat.codes, y_prob_train[:,1])
# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic for Training Data')
plt.legend(loc='lower right')
plt.show()
In [ ]:
# Generate ROC curve for test data
fpr, tpr, thresholds = roc_curve(y_test.cat.codes, y_prob_test[:,1])
roc_auc = roc_auc_score(y_test.cat.codes, y_prob_test[:,1])
# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic for Test Data')
plt.legend(loc='lower right')
plt.show()
In [ ]:
#calculate feature importance
tree_imp = dt_prob_2.feature_importances_
#create a data frame with feature names
# creating a list of column names
column_values = ['importance']
# creating the dataframe
df_tree = pd.DataFrame(data = tree_imp,
columns = column_values)
df_tree['feature']=X.columns
#sort data so features with largest importance values are at the top
df_tree2 = df_tree.sort_values(by=['importance'], ascending=False)
print(df_tree2)
#Create variable importance plot
plt.figure(figsize=(5, 10))
plt.title('Variable Importance')
plt.xlabel('Importance')
plt.ylabel('Feature')
sns.barplot(data=df_tree2,
y=df_tree2['feature'],
x=df_tree2['importance'], color="lightblue")
plt.show()
importance feature 15 0.186640 price 5 0.120284 host_listings_count 18 0.089013 cleaning_fee 2 0.078193 host_response_time 20 0.076561 extra_people 17 0.063001 security_deposit 9 0.049921 room_type 13 0.043169 beds 22 0.043145 review_scores_rating 3 0.038342 host_response_rate 0 0.033662 host_since 27 0.029936 review_scores_location 28 0.021485 review_scores_value 11 0.019586 bathrooms 19 0.018006 guests_included 7 0.017736 host_identity_verified 4 0.016144 host_is_superhost 29 0.014918 instant_bookable 16 0.011435 weekly_price 21 0.010926 minimum_nights 8 0.009458 property_type 12 0.008442 bedrooms 6 0.000000 host_has_profile_pic 23 0.000000 review_scores_accuracy 24 0.000000 review_scores_cleanliness 25 0.000000 review_scores_checkin 26 0.000000 review_scores_communication 1 0.000000 host_location 14 0.000000 bed_type 10 0.000000 accommodates 30 0.000000 cancellation_policy
Tuned Tree¶
In [ ]:
#Use a grid search with a decision tree to determine which parameters obatin the
#best scores on the training set so we have "tuned" parameters or values
dt_tune_prob_2 = DecisionTreeClassifier()
param_grid = {
'max_depth': [None, 5, 10, 15, 20, 25],
'min_samples_leaf': [1, 10, 20, 50, 100],
'ccp_alpha': [0, 0.001, 0.01, 0.1, 0.5, 1]
}
grid_search = GridSearchCV(dt_tune_prob_2, param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_
print(best_params)
print(best_estimator)
{'ccp_alpha': 0, 'max_depth': None, 'min_samples_leaf': 100}
DecisionTreeClassifier(ccp_alpha=0, min_samples_leaf=100)
In [ ]:
# create an instance of a decision tree classifier using "tuned" values
dt_tuned_prob_2 = DecisionTreeClassifier(max_depth = None, min_samples_leaf=100, ccp_alpha = 0)
# fit the model to the training data
dt_tuned_prob_2.fit(X_train, y_train)
Out[ ]:
DecisionTreeClassifier(ccp_alpha=0, min_samples_leaf=100)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier(ccp_alpha=0, min_samples_leaf=100)
In [ ]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
# visualize the decision tree
dot_data = export_graphviz(dt_tuned_prob_2, filled=True, rounded=True, feature_names=X.columns, class_names=['0', '1'])
graph = graphviz.Source(dot_data)
graph.render("decision_tree_tuned_prob_2")
Out[ ]:
'decision_tree_tuned_prob_2.pdf'
In [ ]:
#Show the visualization of the decision tree in this notebook
export_graphviz(dt_tuned_prob_2,
'tree.dot',
class_names=['0','1'],
feature_names = X_train.columns)
! dot -Tpng tree.dot -o tree.png
import matplotlib.pyplot as plt
import cv2
%matplotlib inline
img = cv2.imread('tree.png')
plt.figure(figsize = (20, 40))
plt.imshow(img)
Out[ ]:
<matplotlib.image.AxesImage at 0x7c57c2a185e0>
In [ ]:
# make predictions on the training and test data
y_pred_train = dt_tuned_prob_2.predict(X_train)
y_pred_test = dt_tuned_prob_2.predict(X_test)
y_prob_train = dt_tuned_prob_2.predict_proba(X_train)
y_prob_test = dt_tuned_prob_2.predict_proba(X_test)
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train)
prec_train = precision_score(y_train, y_pred_train)
rec_train = recall_score(y_train, y_pred_train)
# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print("Precision: {:.4f}".format(prec_train))
print("Recall. : {:.4f}".format(rec_train))
print("")
# calculate the accuracy, precision, and recall scores for the testing set
acc_test = accuracy_score(y_test, y_pred_test)
prec_test = precision_score(y_test, y_pred_test)
rec_test = recall_score(y_test, y_pred_test)
# print the scores for the testing set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print("Precision: {:.4f}".format(prec_test))
print("Recall. : {:.4f}".format(rec_test))
-- train set -- Accuracy : 0.6100 Precision: 0.6154 Recall. : 0.5926 -- test set -- Accuracy : 0.5767 Precision: 0.5786 Recall. : 0.5436
In [ ]:
# confusion matrix for training set
conf_matrix = confusion_matrix(y_pred_train, y_train)
print(conf_matrix)
# Generate confusion matrix for test set
conf_matrix = confusion_matrix(y_pred_test, y_test)
print(conf_matrix)
[[219 143] [130 208]] [[92 68] [59 81]]
In [ ]:
# Generate ROC curve for training data
fpr, tpr, thresholds = roc_curve(y_train.cat.codes, y_prob_train[:,1])
roc_auc = roc_auc_score(y_train.cat.codes, y_prob_train[:,1])
# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic for Training Data')
plt.legend(loc='lower right')
plt.show()
In [ ]:
# Generate ROC curve for test data
fpr, tpr, thresholds = roc_curve(y_test.cat.codes, y_prob_test[:,1])
roc_auc = roc_auc_score(y_test.cat.codes, y_prob_test[:,1])
# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic for Test Data')
plt.legend(loc='lower right')
plt.show()
In [ ]:
#calculate feature importance
tree_imp = dt_tuned_prob_2.feature_importances_
#create a data frame with feature names
# creating a list of column names
column_values = ['importance']
# creating the dataframe
df_tree = pd.DataFrame(data = tree_imp,
columns = column_values)
df_tree['feature']=X.columns
#sort data so features with largest importance values are at the top
df_tree2 = df_tree.sort_values(by=['importance'], ascending=False)
print(df_tree2)
#Create variable importance plot
plt.figure(figsize=(5, 10))
plt.title('Variable Importance')
plt.xlabel('Importance')
plt.ylabel('Feature')
sns.barplot(data=df_tree2,
y=df_tree2['feature'],
x=df_tree2['importance'], color="lightblue")
plt.show()
importance feature 5 0.457687 host_listings_count 20 0.410974 extra_people 8 0.112919 property_type 17 0.018419 security_deposit 0 0.000000 host_since 29 0.000000 instant_bookable 28 0.000000 review_scores_value 27 0.000000 review_scores_location 26 0.000000 review_scores_communication 25 0.000000 review_scores_checkin 24 0.000000 review_scores_cleanliness 23 0.000000 review_scores_accuracy 22 0.000000 review_scores_rating 21 0.000000 minimum_nights 19 0.000000 guests_included 18 0.000000 cleaning_fee 15 0.000000 price 16 0.000000 weekly_price 1 0.000000 host_location 14 0.000000 bed_type 13 0.000000 beds 12 0.000000 bedrooms 11 0.000000 bathrooms 10 0.000000 accommodates 9 0.000000 room_type 7 0.000000 host_identity_verified 6 0.000000 host_has_profile_pic 4 0.000000 host_is_superhost 3 0.000000 host_response_rate 2 0.000000 host_response_time 30 0.000000 cancellation_policy
Tree With Lower Min¶
In [ ]:
# create an instance of a decision tree classifier using default values
dt_prob_2_2 = DecisionTreeClassifier(max_depth = None, min_samples_leaf=50, ccp_alpha = 0)
# fit the model to the training data
dt_prob_2_2.fit(X_train, y_train)
Out[ ]:
DecisionTreeClassifier(ccp_alpha=0, min_samples_leaf=50)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier(ccp_alpha=0, min_samples_leaf=50)
In [ ]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
# visualize the decision tree
dot_data = export_graphviz(dt_prob_2_2, filled=True, rounded=True, feature_names=X.columns, class_names=['0', '1'])
graph = graphviz.Source(dot_data)
graph.render("decision_tree_lower_min_prob_2")
Out[ ]:
'decision_tree_lower_min_prob_2.pdf'
In [ ]:
#Show the visualization of the decision tree in this notebook
export_graphviz(dt_prob_2_2,
'tree.dot',
class_names=['0','1'],
feature_names = X_train.columns)
! dot -Tpng tree.dot -o tree.png
import matplotlib.pyplot as plt
import cv2
%matplotlib inline
img = cv2.imread('tree.png')
plt.figure(figsize = (20, 40))
plt.imshow(img)
Out[ ]:
<matplotlib.image.AxesImage at 0x7c57c3852950>
In [ ]:
# make predictions on the training and test data
y_pred_train = dt_prob_2_2.predict(X_train)
y_pred_test = dt_prob_2_2.predict(X_test)
y_prob_train = dt_prob_2_2.predict_proba(X_train)
y_prob_test = dt_prob_2_2.predict_proba(X_test)
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train)
prec_train = precision_score(y_train, y_pred_train)
rec_train = recall_score(y_train, y_pred_train)
# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print("Precision: {:.4f}".format(prec_train))
print("Recall. : {:.4f}".format(rec_train))
print("")
# calculate the accuracy, precision, and recall scores for the test set
acc_test = accuracy_score(y_test, y_pred_test)
prec_test = precision_score(y_test, y_pred_test)
rec_test = recall_score(y_test, y_pred_test)
# print the scores for the test set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print("Precision: {:.4f}".format(prec_test))
print("Recall. : {:.4f}".format(rec_test))
-- train set -- Accuracy : 0.6229 Precision: 0.6570 Recall. : 0.5185 -- test set -- Accuracy : 0.5933 Precision: 0.6063 Recall. : 0.5168
In [ ]:
# confusion matrix for training set
conf_matrix = confusion_matrix(y_pred_train, y_train)
print(conf_matrix)
# Generate confusion matrix for test set
conf_matrix = confusion_matrix(y_pred_test, y_test)
print(conf_matrix)
[[254 169] [ 95 182]] [[101 72] [ 50 77]]
In [ ]:
# Generate ROC curve for training data
fpr, tpr, thresholds = roc_curve(y_train.cat.codes, y_prob_train[:,1])
roc_auc = roc_auc_score(y_train.cat.codes, y_prob_train[:,1])
# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic for Training Data')
plt.legend(loc='lower right')
plt.show()
In [ ]:
# Generate ROC curve for test data
fpr, tpr, thresholds = roc_curve(y_test.cat.codes, y_prob_test[:,1])
roc_auc = roc_auc_score(y_test.cat.codes, y_prob_test[:,1])
# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic for Test Data')
plt.legend(loc='lower right')
plt.show()
In [ ]:
#calculate feature importance
tree_imp = dt_prob_2_2.feature_importances_
#create a data frame with feature names
# creating a list of column names
column_values = ['importance']
# creating the dataframe
df_tree = pd.DataFrame(data = tree_imp,
columns = column_values)
df_tree['feature']=X.columns
#sort data so features with largest importance values are at the top
df_tree2 = df_tree.sort_values(by=['importance'], ascending=False)
print(df_tree2)
#Create variable importance plot
plt.figure(figsize=(5, 10))
plt.title('Variable Importance')
plt.xlabel('Importance')
plt.ylabel('Feature')
sns.barplot(data=df_tree2,
y=df_tree2['feature'],
x=df_tree2['importance'], color="lightblue")
plt.show()
importance feature 5 0.269645 host_listings_count 20 0.242124 extra_people 15 0.202653 price 22 0.131451 review_scores_rating 28 0.078247 review_scores_value 9 0.075881 room_type 17 0.000000 security_deposit 29 0.000000 instant_bookable 27 0.000000 review_scores_location 26 0.000000 review_scores_communication 25 0.000000 review_scores_checkin 24 0.000000 review_scores_cleanliness 23 0.000000 review_scores_accuracy 21 0.000000 minimum_nights 19 0.000000 guests_included 18 0.000000 cleaning_fee 0 0.000000 host_since 16 0.000000 weekly_price 1 0.000000 host_location 14 0.000000 bed_type 13 0.000000 beds 12 0.000000 bedrooms 11 0.000000 bathrooms 10 0.000000 accommodates 8 0.000000 property_type 7 0.000000 host_identity_verified 6 0.000000 host_has_profile_pic 4 0.000000 host_is_superhost 3 0.000000 host_response_rate 2 0.000000 host_response_time 30 0.000000 cancellation_policy
Tree With Even Lower Min¶
In [ ]:
# create an instance of a decision tree classifier using default values
dt_prob_2_3 = DecisionTreeClassifier(max_depth = None, min_samples_leaf=30, ccp_alpha = 0)
# fit the model to the training data
dt_prob_2_3.fit(X_train, y_train)
Out[ ]:
DecisionTreeClassifier(ccp_alpha=0, min_samples_leaf=30)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier(ccp_alpha=0, min_samples_leaf=30)
In [ ]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
# visualize the decision tree
dot_data = export_graphviz(dt_prob_2_3, filled=True, rounded=True, feature_names=X.columns, class_names=['0', '1'])
graph = graphviz.Source(dot_data)
graph.render("decision_tree_less_complexity_prob_2")
Out[ ]:
'decision_tree_less_complexity_prob_2.pdf'
In [ ]:
#Show the visualization of the decision tree in this notebook
export_graphviz(dt_prob_2_3,
'tree.dot',
class_names=['0','1'],
feature_names = X_train.columns)
! dot -Tpng tree.dot -o tree.png
import matplotlib.pyplot as plt
import cv2
%matplotlib inline
img = cv2.imread('tree.png')
plt.figure(figsize = (20, 40))
plt.imshow(img)
Out[ ]:
<matplotlib.image.AxesImage at 0x7c57c3d9bee0>
In [ ]:
# make predictions on the training and test data
y_pred_train = dt_prob_2_3.predict(X_train)
y_pred_test = dt_prob_2_3.predict(X_test)
y_prob_train = dt_prob_2_3.predict_proba(X_train)
y_prob_test = dt_prob_2_3.predict_proba(X_test)
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train)
prec_train = precision_score(y_train, y_pred_train)
rec_train = recall_score(y_train, y_pred_train)
# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print("Precision: {:.4f}".format(prec_train))
print("Recall. : {:.4f}".format(rec_train))
print("")
# calculate the accuracy, precision, and recall scores for the test set
acc_test = accuracy_score(y_test, y_pred_test)
prec_test = precision_score(y_test, y_pred_test)
rec_test = recall_score(y_test, y_pred_test)
# print the scores for the test set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print("Precision: {:.4f}".format(prec_test))
print("Recall. : {:.4f}".format(rec_test))
-- train set -- Accuracy : 0.6586 Precision: 0.6505 Recall. : 0.6895 -- test set -- Accuracy : 0.5933 Precision: 0.5808 Recall. : 0.6510
In [ ]:
# confusion matrix for training set
conf_matrix = confusion_matrix(y_pred_train, y_train)
print(conf_matrix)
# Generate confusion matrix for test set
conf_matrix = confusion_matrix(y_pred_test, y_test)
print(conf_matrix)
[[219 109] [130 242]] [[81 52] [70 97]]
In [ ]:
# Generate ROC curve for training data
fpr, tpr, thresholds = roc_curve(y_train.cat.codes, y_prob_train[:,1])
roc_auc = roc_auc_score(y_train.cat.codes, y_prob_train[:,1])
# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic for Training Data')
plt.legend(loc='lower right')
plt.show()
In [ ]:
# Generate ROC curve for test data
fpr, tpr, thresholds = roc_curve(y_test.cat.codes, y_prob_test[:,1])
roc_auc = roc_auc_score(y_test.cat.codes, y_prob_test[:,1])
# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic for Test Data')
plt.legend(loc='lower right')
plt.show()
In [ ]:
#calculate feature importance
tree_imp = dt_prob_2_3.feature_importances_
#create a data frame with feature names
# creating a list of column names
column_values = ['importance']
# creating the dataframe
df_tree = pd.DataFrame(data = tree_imp,
columns = column_values)
df_tree['feature']=X.columns
#sort data so features with largest importance values are at the top
df_tree2 = df_tree.sort_values(by=['importance'], ascending=False)
print(df_tree2)
#Create variable importance plot
plt.figure(figsize=(5, 10))
plt.title('Variable Importance')
plt.xlabel('Importance')
plt.ylabel('Feature')
sns.barplot(data=df_tree2,
y=df_tree2['feature'],
x=df_tree2['importance'], color="lightblue")
plt.show()
importance feature 5 0.225162 host_listings_count 20 0.162890 extra_people 15 0.107165 price 0 0.086502 host_since 22 0.085147 review_scores_rating 18 0.085131 cleaning_fee 3 0.084799 host_response_rate 28 0.052641 review_scores_value 9 0.051049 room_type 13 0.030899 beds 10 0.028615 accommodates 4 0.000000 host_is_superhost 2 0.000000 host_response_time 29 0.000000 instant_bookable 27 0.000000 review_scores_location 26 0.000000 review_scores_communication 25 0.000000 review_scores_checkin 24 0.000000 review_scores_cleanliness 23 0.000000 review_scores_accuracy 21 0.000000 minimum_nights 17 0.000000 security_deposit 19 0.000000 guests_included 16 0.000000 weekly_price 1 0.000000 host_location 14 0.000000 bed_type 12 0.000000 bedrooms 11 0.000000 bathrooms 8 0.000000 property_type 7 0.000000 host_identity_verified 6 0.000000 host_has_profile_pic 30 0.000000 cancellation_policy
Create And Assess Logistic Regression Models¶
Full Logistic¶
In [ ]:
#Fit full logistic regression model to examine the significance of the terms
X = X_train
X = sm.add_constant(X)
y=y_train
logit_full_prob_2 = sm.Logit(y, X).fit()
#Print summary
print(logit_full_prob_2.summary())
Optimization terminated successfully.
Current function value: 0.656205
Iterations 6
Logit Regression Results
==============================================================================
Dep. Variable: availability_90 No. Observations: 700
Model: Logit Df Residuals: 669
Method: MLE Df Model: 30
Date: Wed, 13 Dec 2023 Pseudo R-squ.: 0.05329
Time: 22:46:20 Log-Likelihood: -459.34
converged: True LL-Null: -485.20
Covariance Type: nonrobust LLR p-value: 0.008177
===============================================================================================
coef std err z P>|z| [0.025 0.975]
-----------------------------------------------------------------------------------------------
const -14.5786 4.55e+15 -3.2e-15 1.000 -8.92e+15 8.92e+15
host_since 0.0122 0.063 0.194 0.846 -0.111 0.135
host_location 0.3165 0.254 1.247 0.212 -0.181 0.814
host_response_time -0.0875 0.123 -0.713 0.476 -0.328 0.153
host_response_rate 0.0153 0.008 1.814 0.070 -0.001 0.032
host_is_superhost -0.2970 0.224 -1.327 0.184 -0.736 0.142
host_listings_count -0.0011 0.002 -0.713 0.476 -0.004 0.002
host_has_profile_pic -14.5786 4.55e+15 -3.2e-15 1.000 -8.92e+15 8.92e+15
host_identity_verified -0.3349 0.203 -1.646 0.100 -0.734 0.064
property_type -0.0347 0.021 -1.621 0.105 -0.077 0.007
room_type -0.5357 0.220 -2.430 0.015 -0.968 -0.104
accommodates -0.0652 0.060 -1.091 0.275 -0.182 0.052
bathrooms -0.0465 0.192 -0.242 0.809 -0.423 0.330
bedrooms 0.2715 0.150 1.807 0.071 -0.023 0.566
beds -0.1236 0.098 -1.266 0.206 -0.315 0.068
bed_type -0.1606 0.172 -0.932 0.351 -0.498 0.177
price 0.0023 0.002 0.925 0.355 -0.003 0.007
weekly_price -0.0004 0.000 -0.993 0.320 -0.001 0.000
security_deposit -3.572e-06 0.000 -0.014 0.989 -0.001 0.001
cleaning_fee -0.0005 0.002 -0.221 0.825 -0.005 0.004
guests_included -0.0863 0.062 -1.403 0.161 -0.207 0.034
extra_people -0.0024 0.003 -0.771 0.441 -0.009 0.004
minimum_nights 0.0299 0.029 1.038 0.299 -0.026 0.086
review_scores_rating -0.0162 0.029 -0.565 0.572 -0.073 0.040
review_scores_accuracy 0.1096 0.309 0.355 0.723 -0.496 0.715
review_scores_cleanliness 0.1237 0.127 0.974 0.330 -0.125 0.373
review_scores_checkin -0.1337 nan nan nan nan nan
review_scores_communication 0.4864 nan nan nan nan nan
review_scores_location 0.0066 0.091 0.073 0.942 -0.172 0.185
review_scores_value 0.0583 0.143 0.409 0.683 -0.221 0.338
instant_bookable 0.1178 0.262 0.449 0.653 -0.396 0.632
cancellation_policy -0.0068 0.069 -0.098 0.922 -0.142 0.128
===============================================================================================
In [ ]:
# Generate predicted values for training set
pprob = logit_full_prob_2.predict(X)
# Create predicted category for success using 50% cutoff
psuccess = (pprob > 0.5).astype(int)
# Add new variables to the training data set
X_train['p_success'] = psuccess
X_train['p_prob'] = pprob
X_train['y'] = y_train
X_train.info()
# Generate predicted values for test set
X_test = sm.add_constant(X_test)
pprob_test = logit_full_prob_2.predict(X_test)
# Create predicted category for success using 50% cutoff
psuccess_test = (pprob_test > 0.5).astype(int)
# Add new variables to the response data set
X_test['p_success'] = psuccess_test
X_test['p_prob'] = pprob_test
X_test.info()
<class 'pandas.core.frame.DataFrame'> MultiIndex: 700 entries, (0, 57) to (1, 2090) Data columns (total 34 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 host_since 700 non-null int64 1 host_location 700 non-null category 2 host_response_time 700 non-null int64 3 host_response_rate 700 non-null float64 4 host_is_superhost 700 non-null category 5 host_listings_count 700 non-null float64 6 host_has_profile_pic 700 non-null category 7 host_identity_verified 700 non-null category 8 property_type 700 non-null int64 9 room_type 700 non-null int64 10 accommodates 700 non-null int64 11 bathrooms 700 non-null float64 12 bedrooms 700 non-null float64 13 beds 700 non-null float64 14 bed_type 700 non-null int64 15 price 700 non-null float64 16 weekly_price 700 non-null float64 17 security_deposit 700 non-null float64 18 cleaning_fee 700 non-null float64 19 guests_included 700 non-null int64 20 extra_people 700 non-null float64 21 minimum_nights 700 non-null int64 22 review_scores_rating 700 non-null float64 23 review_scores_accuracy 700 non-null float64 24 review_scores_cleanliness 700 non-null float64 25 review_scores_checkin 700 non-null float64 26 review_scores_communication 700 non-null float64 27 review_scores_location 700 non-null float64 28 review_scores_value 700 non-null float64 29 instant_bookable 700 non-null category 30 cancellation_policy 700 non-null int64 31 p_success 700 non-null int64 32 p_prob 700 non-null float64 33 y 700 non-null category dtypes: category(6), float64(18), int64(10) memory usage: 200.4 KB <class 'pandas.core.frame.DataFrame'> MultiIndex: 300 entries, (0, 5415) to (0, 1953) Data columns (total 34 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 const 300 non-null float64 1 host_since 300 non-null int64 2 host_location 300 non-null category 3 host_response_time 300 non-null int64 4 host_response_rate 300 non-null float64 5 host_is_superhost 300 non-null category 6 host_listings_count 300 non-null float64 7 host_has_profile_pic 300 non-null category 8 host_identity_verified 300 non-null category 9 property_type 300 non-null int64 10 room_type 300 non-null int64 11 accommodates 300 non-null int64 12 bathrooms 300 non-null float64 13 bedrooms 300 non-null float64 14 beds 300 non-null float64 15 bed_type 300 non-null int64 16 price 300 non-null float64 17 weekly_price 300 non-null float64 18 security_deposit 300 non-null float64 19 cleaning_fee 300 non-null float64 20 guests_included 300 non-null int64 21 extra_people 300 non-null float64 22 minimum_nights 300 non-null int64 23 review_scores_rating 300 non-null float64 24 review_scores_accuracy 300 non-null float64 25 review_scores_cleanliness 300 non-null float64 26 review_scores_checkin 300 non-null float64 27 review_scores_communication 300 non-null float64 28 review_scores_location 300 non-null float64 29 review_scores_value 300 non-null float64 30 instant_bookable 300 non-null category 31 cancellation_policy 300 non-null int64 32 p_success 300 non-null int64 33 p_prob 300 non-null float64 dtypes: category(5), float64(19), int64(10) memory usage: 111.3 KB
In [ ]:
# Generate confusion matrix for training set
conf_matrix = confusion_matrix(psuccess, y_train)
print(conf_matrix)
# Generate confusion matrix for test set
conf_matrix = confusion_matrix(psuccess_test, y_test)
print(conf_matrix)
[[209 127] [140 224]] [[93 59] [58 90]]
In [ ]:
# Generate ROC curve for training data
fpr, tpr, thresholds = roc_curve(y_train.cat.codes, pprob)
roc_auc = roc_auc_score(y_train.cat.codes, pprob)
# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic for Training Data')
plt.legend(loc='lower right')
plt.show()
In [ ]:
# Generate ROC curve for test data
fpr, tpr, thresholds = roc_curve(y_test.cat.codes, pprob_test)
roc_auc = roc_auc_score(y_test.cat.codes, pprob_test)
# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic for Test Data')
plt.legend(loc='lower right')
plt.show()
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, psuccess)
prec_train = precision_score(y_train, psuccess)
rec_train = recall_score(y_train, psuccess)
# print the scores for the training set
print("Accuracy (Train) : {:.4f}".format(acc_train))
print("Precision (Train): {:.4f}".format(prec_train))
print("Recall (Train) : {:.4f}".format(rec_train))
print("")
# calculate the accuracy, precision, and recall scores for the testing set
acc_test = accuracy_score(y_test, psuccess_test)
prec_test = precision_score(y_test, psuccess_test)
rec_test = recall_score(y_test, psuccess_test)
# print the scores for the testing set
print("Accuracy (Test) : {:.4f}".format(acc_test))
print("Precision (Test): {:.4f}".format(prec_test))
print("Recall (Test) : {:.4f}".format(rec_test))
print("")
Accuracy (Train) : 0.6186 Precision (Train): 0.6154 Recall (Train) : 0.6382 Accuracy (Test) : 0.6100 Precision (Test): 0.6081 Recall (Test) : 0.6040
Create And Assess The LASSO and Ridge Regression Models¶
Fix Training And Testing Data¶
In [ ]:
#Fix the training and testing data by removing the columns that were
#added by the full logistic regression model
X.drop('const', axis=1, inplace=True)
X_train.drop(['p_success', 'p_prob', 'y'], axis=1, inplace=True)
X_test.drop(['const', 'p_success', 'p_prob'], axis=1, inplace=True)
Create¶
In [ ]:
# Create an Instance of Logistic Regression for LASSO Selection using c = 0.1 and c = 0.01
lr_l1_1_prob_2 = LogisticRegression(penalty='l1', solver='liblinear', C=0.1)
lr_l1_01_prob_2 = LogisticRegression(penalty='l1', solver='liblinear', C=0.01)
# fit the models to the training data
lr_l1_1_prob_2.fit(X_train, y_train)
lr_l1_01_prob_2.fit(X_train, y_train)
# Create an Instance of Logistic Regression for LASSO Selection using c = 1 and c = 0.7
lr_l1_10_prob_2 = LogisticRegression(penalty='l1', solver='liblinear', C=1)
lr_l1_7_prob_2 = LogisticRegression(penalty='l1', solver='liblinear', C=0.7)
# fit the models to the training data
lr_l1_10_prob_2.fit(X_train, y_train)
lr_l1_7_prob_2.fit(X_train, y_train)
# Create an Instance of Logistic Regression for Ridge Regression (L2 regularization)
lr_l2_prob_2 = LogisticRegression(penalty='l2', solver='liblinear')
# fit the models to the training data
lr_l2_prob_2.fit(X_train, y_train)
Out[ ]:
LogisticRegression(solver='liblinear')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(solver='liblinear')
Analyze The Importance Of Different Categories In The Models¶
In [ ]:
# function for model coefficients
def rpt_model_variables(model):
# Get the intercept term
intercept = model.intercept_
# Access the coefficients (weights) of the model, i rounded them
coefficients = np.round(model.coef_[0],decimals=4)
# Create DataFrames for intercept and coefficients
#df_intercept = pd.DataFrame({'feature': ['Intercept'], 'coefficient': [intercept[0]]})
df_coefficients = pd.DataFrame({'feature': X.columns, 'coefficient': coefficients})
df_coefficients['abs_coefficient'] = df_coefficients['coefficient'].abs()
df_coefficients.sort_values(by='abs_coefficient', ascending=False, inplace=True)
# if you want to add intercept to table
#df_model = pd.concat([df_intercept, df_coefficients]).reset_index(drop=True)
# Print the DataFrame
print(df_coefficients)
return df_coefficients
#Evalute the model coefficients for the models
print("Lasso C=0.1")
df_coefficients1 = rpt_model_variables(lr_l1_1_prob_2)
print("")
print("Lasso C=0.01")
df_coefficients01 = rpt_model_variables(lr_l1_01_prob_2)
print("")
print("Lasso C=1")
df_coefficients10 = rpt_model_variables(lr_l1_10_prob_2)
print("")
print("Lasso C=0.7")
df_coefficients7 = rpt_model_variables(lr_l1_7_prob_2)
print("")
print("Ridge Regression")
df_coefficients2 = rpt_model_variables(lr_l2_prob_2)
Lasso C=0.1
feature coefficient abs_coefficient
9 room_type -0.1566 0.1566
13 beds -0.0629 0.0629
7 host_identity_verified -0.0564 0.0564
19 guests_included -0.0482 0.0482
8 property_type -0.0395 0.0395
10 accommodates -0.0223 0.0223
21 minimum_nights 0.0177 0.0177
12 bedrooms 0.0167 0.0167
3 host_response_rate 0.0126 0.0126
22 review_scores_rating 0.0061 0.0061
20 extra_people -0.0036 0.0036
15 price 0.0025 0.0025
5 host_listings_count -0.0010 0.0010
2 host_response_time -0.0009 0.0009
0 host_since -0.0006 0.0006
16 weekly_price -0.0004 0.0004
1 host_location 0.0000 0.0000
17 security_deposit 0.0000 0.0000
18 cleaning_fee 0.0000 0.0000
14 bed_type 0.0000 0.0000
11 bathrooms 0.0000 0.0000
6 host_has_profile_pic 0.0000 0.0000
4 host_is_superhost 0.0000 0.0000
23 review_scores_accuracy 0.0000 0.0000
24 review_scores_cleanliness 0.0000 0.0000
25 review_scores_checkin 0.0000 0.0000
26 review_scores_communication 0.0000 0.0000
27 review_scores_location 0.0000 0.0000
28 review_scores_value 0.0000 0.0000
29 instant_bookable 0.0000 0.0000
30 cancellation_policy 0.0000 0.0000
Lasso C=0.01
feature coefficient abs_coefficient
8 property_type -0.0158 0.0158
3 host_response_rate 0.0077 0.0077
20 extra_people -0.0043 0.0043
15 price 0.0017 0.0017
18 cleaning_fee -0.0015 0.0015
5 host_listings_count -0.0006 0.0006
16 weekly_price -0.0003 0.0003
0 host_since -0.0002 0.0002
17 security_deposit 0.0001 0.0001
7 host_identity_verified 0.0000 0.0000
2 host_response_time 0.0000 0.0000
29 instant_bookable 0.0000 0.0000
28 review_scores_value 0.0000 0.0000
27 review_scores_location 0.0000 0.0000
26 review_scores_communication 0.0000 0.0000
25 review_scores_checkin 0.0000 0.0000
24 review_scores_cleanliness 0.0000 0.0000
23 review_scores_accuracy 0.0000 0.0000
22 review_scores_rating 0.0000 0.0000
21 minimum_nights 0.0000 0.0000
19 guests_included 0.0000 0.0000
9 room_type 0.0000 0.0000
4 host_is_superhost 0.0000 0.0000
6 host_has_profile_pic 0.0000 0.0000
1 host_location 0.0000 0.0000
14 bed_type 0.0000 0.0000
13 beds 0.0000 0.0000
12 bedrooms 0.0000 0.0000
11 bathrooms 0.0000 0.0000
10 accommodates 0.0000 0.0000
30 cancellation_policy 0.0000 0.0000
Lasso C=1
feature coefficient abs_coefficient
9 room_type -0.4856 0.4856
26 review_scores_communication 0.3969 0.3969
7 host_identity_verified -0.3062 0.3062
4 host_is_superhost -0.2645 0.2645
12 bedrooms 0.2355 0.2355
1 host_location 0.2241 0.2241
13 beds -0.1213 0.1213
14 bed_type -0.1122 0.1122
24 review_scores_cleanliness 0.0991 0.0991
19 guests_included -0.0808 0.0808
2 host_response_time -0.0699 0.0699
23 review_scores_accuracy 0.0691 0.0691
25 review_scores_checkin -0.0622 0.0622
10 accommodates -0.0579 0.0579
28 review_scores_value 0.0471 0.0471
29 instant_bookable 0.0390 0.0390
8 property_type -0.0360 0.0360
21 minimum_nights 0.0270 0.0270
11 bathrooms -0.0243 0.0243
3 host_response_rate 0.0150 0.0150
22 review_scores_rating -0.0114 0.0114
30 cancellation_policy -0.0098 0.0098
20 extra_people -0.0025 0.0025
15 price 0.0022 0.0022
0 host_since -0.0021 0.0021
5 host_listings_count -0.0010 0.0010
18 cleaning_fee -0.0005 0.0005
16 weekly_price -0.0003 0.0003
17 security_deposit -0.0000 0.0000
27 review_scores_location 0.0000 0.0000
6 host_has_profile_pic 0.0000 0.0000
Lasso C=0.7
feature coefficient abs_coefficient
9 room_type -0.4652 0.4652
26 review_scores_communication 0.3591 0.3591
7 host_identity_verified -0.2932 0.2932
4 host_is_superhost -0.2486 0.2486
12 bedrooms 0.2200 0.2200
1 host_location 0.1872 0.1872
13 beds -0.1207 0.1207
14 bed_type -0.0912 0.0912
24 review_scores_cleanliness 0.0892 0.0892
19 guests_included -0.0782 0.0782
2 host_response_time -0.0629 0.0629
10 accommodates -0.0545 0.0545
23 review_scores_accuracy 0.0522 0.0522
28 review_scores_value 0.0410 0.0410
8 property_type -0.0363 0.0363
25 review_scores_checkin -0.0318 0.0318
21 minimum_nights 0.0262 0.0262
11 bathrooms -0.0150 0.0150
3 host_response_rate 0.0149 0.0149
30 cancellation_policy -0.0100 0.0100
22 review_scores_rating -0.0095 0.0095
29 instant_bookable 0.0043 0.0043
20 extra_people -0.0026 0.0026
15 price 0.0023 0.0023
0 host_since -0.0021 0.0021
5 host_listings_count -0.0010 0.0010
18 cleaning_fee -0.0005 0.0005
16 weekly_price -0.0003 0.0003
17 security_deposit -0.0000 0.0000
27 review_scores_location 0.0000 0.0000
6 host_has_profile_pic 0.0000 0.0000
Ridge Regression
feature coefficient abs_coefficient
9 room_type -0.5083 0.5083
26 review_scores_communication 0.4477 0.4477
7 host_identity_verified -0.3260 0.3260
4 host_is_superhost -0.2901 0.2901
1 host_location 0.2870 0.2870
12 bedrooms 0.2671 0.2671
14 bed_type -0.1475 0.1475
13 beds -0.1237 0.1237
25 review_scores_checkin -0.1192 0.1192
24 review_scores_cleanliness 0.1168 0.1168
29 instant_bookable 0.1120 0.1120
23 review_scores_accuracy 0.1037 0.1037
2 host_response_time -0.0859 0.0859
19 guests_included -0.0858 0.0858
10 accommodates -0.0627 0.0627
28 review_scores_value 0.0591 0.0591
11 bathrooms -0.0489 0.0489
8 property_type -0.0358 0.0358
21 minimum_nights 0.0291 0.0291
3 host_response_rate 0.0154 0.0154
22 review_scores_rating -0.0153 0.0153
30 cancellation_policy -0.0103 0.0103
27 review_scores_location 0.0080 0.0080
20 extra_people -0.0025 0.0025
0 host_since -0.0022 0.0022
15 price 0.0022 0.0022
5 host_listings_count -0.0011 0.0011
6 host_has_profile_pic -0.0006 0.0006
18 cleaning_fee -0.0005 0.0005
16 weekly_price -0.0003 0.0003
17 security_deposit -0.0000 0.0000
In [ ]:
# plot variable importance
# function to plot variable importance by creating a bar chart
# of absolute coefficients
def plot_variable_imp(df_coef):
# determine the variables the model is using and create df
# of their absolute coefficients
df_plt = df_coef[df_coef['abs_coefficient'] != 0]
# determine the variables the model is not using
reject_vars = df_coef[df_coef['abs_coefficient'] == 0]['feature'].tolist()
# bar graph of the absolute coefficients that the model is using
plt.figure(figsize=(5, 10))
plt.title('Variable Importance')
plt.xlabel('Coefficient')
plt.ylabel('Feature')
sns.barplot(data=df_plt,
y=df_plt['feature'],
x=df_plt['abs_coefficient'], color="lightblue")
plt.show()
# print the variables the model is not using after the bar graph
print("-- rejected --")
for i in reject_vars:
print(f" {i}")
# plot the variable importance for the models
print("Lasso C=0.1")
plot_variable_imp(df_coefficients1)
print("")
print("Lasso C=0.01")
plot_variable_imp(df_coefficients01)
print("")
print("Lasso C=1")
plot_variable_imp(df_coefficients10)
print("")
print("Lasso C=0.7")
plot_variable_imp(df_coefficients7)
print("")
print("Ridge Regression")
plot_variable_imp(df_coefficients2)
Lasso C=0.1
-- rejected -- host_location security_deposit cleaning_fee bed_type bathrooms host_has_profile_pic host_is_superhost review_scores_accuracy review_scores_cleanliness review_scores_checkin review_scores_communication review_scores_location review_scores_value instant_bookable cancellation_policy Lasso C=0.01
-- rejected -- host_identity_verified host_response_time instant_bookable review_scores_value review_scores_location review_scores_communication review_scores_checkin review_scores_cleanliness review_scores_accuracy review_scores_rating minimum_nights guests_included room_type host_is_superhost host_has_profile_pic host_location bed_type beds bedrooms bathrooms accommodates cancellation_policy Lasso C=1
-- rejected -- security_deposit review_scores_location host_has_profile_pic Lasso C=0.7
-- rejected -- security_deposit review_scores_location host_has_profile_pic Ridge Regression
-- rejected -- security_deposit
Make Predictions To Evaluate The Models¶
In [ ]:
# make predictions on the training and testing data for all of the models to
# evaluate the models
#Lasso C=0.1
y_pred_train = lr_l1_1_prob_2.predict(X_train)
y_pred_test = lr_l1_1_prob_2.predict(X_test)
y_proba_train = lr_l1_1_prob_2.predict_proba(X_train)
y_proba_test = lr_l1_1_prob_2.predict_proba(X_test)
#Lasso C=0.01
y_pred_train1 = lr_l1_01_prob_2.predict(X_train)
y_pred_test1 = lr_l1_01_prob_2.predict(X_test)
y_proba_train1 = lr_l1_01_prob_2.predict_proba(X_train)
y_proba_test1 = lr_l1_01_prob_2.predict_proba(X_test)
#Lasso C=1
y_pred_train10 = lr_l1_10_prob_2.predict(X_train)
y_pred_test10 = lr_l1_10_prob_2.predict(X_test)
y_proba_train10 = lr_l1_10_prob_2.predict_proba(X_train)
y_proba_test10 = lr_l1_10_prob_2.predict_proba(X_test)
#Lasso C=0.7
y_pred_train7 = lr_l1_7_prob_2.predict(X_train)
y_pred_test7 = lr_l1_7_prob_2.predict(X_test)
y_proba_train7 = lr_l1_7_prob_2.predict_proba(X_train)
y_proba_test7 = lr_l1_7_prob_2.predict_proba(X_test)
#Ridge Regression
y_pred_train2 = lr_l2_prob_2.predict(X_train)
y_pred_test2 = lr_l2_prob_2.predict(X_test)
y_proba_train2 = lr_l2_prob_2.predict_proba(X_train)
y_proba_test2 = lr_l2_prob_2.predict_proba(X_test)
Evaluate The Models¶
L1 with c=0.1¶
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc2_train = accuracy_score(y_train, y_pred_train)
prec2_train = precision_score(y_train, y_pred_train)
rec2_train = recall_score(y_train, y_pred_train)
auc2_train = roc_auc_score(y_train, y_proba_train[:,1])
# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc2_train))
print("Precision: {:.4f}".format(prec2_train))
print("Recall. : {:.4f}".format(rec2_train))
print("AUC : {:.4f}".format(auc2_train))
print("")
# calculate the accuracy, precision, and recall scores for the testing set
acc2_test = accuracy_score(y_test, y_pred_test)
prec2_test = precision_score(y_test, y_pred_test)
rec2_test = recall_score(y_test, y_pred_test)
auc2_test = roc_auc_score(y_test, y_proba_test[:,1])
# print the scores for the testing set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc2_test))
print("Precision: {:.4f}".format(prec2_test))
print("Recall. : {:.4f}".format(rec2_test))
print("AUC : {:.4f}".format(auc2_test))
-- train set -- Accuracy : 0.5743 Precision: 0.5714 Recall. : 0.6040 AUC : 0.6181 -- test set -- Accuracy : 0.5667 Precision: 0.5590 Recall. : 0.6040 AUC : 0.6036
L1 with c=0.01¶
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc2_train1 = accuracy_score(y_train, y_pred_train1)
prec2_train1 = precision_score(y_train, y_pred_train1)
rec2_train1 = recall_score(y_train, y_pred_train1)
auc2_train1 = roc_auc_score(y_train, y_proba_train1[:,1])
# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc2_train1))
print("Precision: {:.4f}".format(prec2_train1))
print("Recall. : {:.4f}".format(rec2_train1))
print("AUC : {:.4f}".format(auc2_train1))
print("")
# calculate the accuracy, precision, and recall scores for the testing set
acc2_test1 = accuracy_score(y_test, y_pred_test1)
prec2_test1 = precision_score(y_test, y_pred_test1)
rec2_test1 = recall_score(y_test, y_pred_test1)
auc2_test1 = roc_auc_score(y_test, y_proba_test1[:,1])
# print the scores for the testing set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc2_test1))
print("Precision: {:.4f}".format(prec2_test1))
print("Recall. : {:.4f}".format(rec2_test1))
print("AUC : {:.4f}".format(auc2_test1))
-- train set -- Accuracy : 0.5471 Precision: 0.5390 Recall. : 0.6695 AUC : 0.5952 -- test set -- Accuracy : 0.5433 Precision: 0.5337 Recall. : 0.6376 AUC : 0.5717
L1 with C=1¶
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc2_train10 = accuracy_score(y_train, y_pred_train10)
prec2_train10 = precision_score(y_train, y_pred_train10)
rec2_train10 = recall_score(y_train, y_pred_train10)
auc2_train10 = roc_auc_score(y_train, y_proba_train10[:,1])
# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc2_train10))
print("Precision: {:.4f}".format(prec2_train10))
print("Recall. : {:.4f}".format(rec2_train10))
print("AUC : {:.4f}".format(auc2_train10))
print("")
# calculate the accuracy, precision, and recall scores for the testing set
acc2_test10 = accuracy_score(y_test, y_pred_test10)
prec2_test10 = precision_score(y_test, y_pred_test10)
rec2_test10 = recall_score(y_test, y_pred_test10)
auc2_test10 = roc_auc_score(y_test, y_proba_test10[:,1])
# print the scores for the testing set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc2_test10))
print("Precision: {:.4f}".format(prec2_test10))
print("Recall. : {:.4f}".format(rec2_test10))
print("AUC : {:.4f}".format(auc2_test10))
-- train set -- Accuracy : 0.6143 Precision: 0.6116 Recall. : 0.6325 AUC : 0.6523 -- test set -- Accuracy : 0.6000 Precision: 0.5973 Recall. : 0.5973 AUC : 0.6206
L1 with C=0.7¶
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc2_train7 = accuracy_score(y_train, y_pred_train7)
prec2_train7 = precision_score(y_train, y_pred_train7)
rec2_train7 = recall_score(y_train, y_pred_train7)
auc2_train7 = roc_auc_score(y_train, y_proba_train7[:,1])
# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc2_train7))
print("Precision: {:.4f}".format(prec2_train7))
print("Recall. : {:.4f}".format(rec2_train7))
print("AUC : {:.4f}".format(auc2_train7))
print("")
# calculate the accuracy, precision, and recall scores for the testing set
acc2_test7 = accuracy_score(y_test, y_pred_test7)
prec2_test7 = precision_score(y_test, y_pred_test7)
rec2_test7 = recall_score(y_test, y_pred_test7)
auc2_test7 = roc_auc_score(y_test, y_proba_test7[:,1])
# print the scores for the testing set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc2_test7))
print("Precision: {:.4f}".format(prec2_test7))
print("Recall. : {:.4f}".format(rec2_test7))
print("AUC : {:.4f}".format(auc2_test7))
-- train set -- Accuracy : 0.6143 Precision: 0.6122 Recall. : 0.6296 AUC : 0.6519 -- test set -- Accuracy : 0.5900 Precision: 0.5878 Recall. : 0.5839 AUC : 0.6223
L2 Regularization¶
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc2_train2 = accuracy_score(y_train, y_pred_train2)
prec2_train2 = precision_score(y_train, y_pred_train2)
rec2_train2 = recall_score(y_train, y_pred_train2)
auc2_train2 = roc_auc_score(y_train, y_proba_train2[:,1])
# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc2_train2))
print("Precision: {:.4f}".format(prec2_train2))
print("Recall. : {:.4f}".format(rec2_train2))
print("AUC : {:.4f}".format(auc2_train2))
print("")
# calculate the accuracy, precision, and recall scores for the testing set
acc2_test2 = accuracy_score(y_test, y_pred_test2)
prec2_test2 = precision_score(y_test, y_pred_test2)
rec2_test2 = recall_score(y_test, y_pred_test2)
auc2_test2 = roc_auc_score(y_test, y_proba_test2[:,1])
# print the scores for the testing set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc2_test2))
print("Precision: {:.4f}".format(prec2_test2))
print("Recall. : {:.4f}".format(rec2_test2))
print("AUC : {:.4f}".format(auc2_test2))
-- train set -- Accuracy : 0.6171 Precision: 0.6150 Recall. : 0.6325 AUC : 0.6534 -- test set -- Accuracy : 0.6033 Precision: 0.6000 Recall. : 0.6040 AUC : 0.6186
Problem 3¶
Linear Regression¶
In [ ]:
df_prob_3 = df.drop(['review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value'], axis=1)
df_prob_3 = df_prob_3[df_prob_3['review_scores_rating'] >= 80]
In [ ]:
# Display summary statistics
print(df_prob_3.describe())
# Compute correlations
cormat = df_prob_3.corr()
# Round correlation matrix to 2 decimal places
cormat = cormat.round(2)
# Plot correlation matrix using a heatmap
plt.figure(figsize = (20,16))
sns.heatmap(cormat,annot=True, cmap = 'coolwarm')
plt.show()
host_since host_response_time host_response_rate \
count 3320.000000 3320.000000 3320.000000
mean 2013.023494 2.281024 94.146988
std 1.466526 0.799745 12.624938
min 2008.000000 0.000000 13.000000
25% 2012.000000 2.000000 93.000000
50% 2013.000000 2.000000 100.000000
75% 2014.000000 3.000000 100.000000
max 2015.000000 3.000000 100.000000
host_listings_count property_type room_type accommodates \
count 3320.000000 3320.000000 3320.000000 3320.000000
mean 15.385241 6.401506 0.310241 4.437952
std 63.969021 4.235170 0.502015 2.812790
min 1.000000 0.000000 0.000000 1.000000
25% 1.000000 0.000000 0.000000 2.000000
50% 1.000000 9.000000 0.000000 4.000000
75% 2.000000 9.000000 1.000000 6.000000
max 339.000000 17.000000 2.000000 16.000000
bathrooms bedrooms beds ... guests_included \
count 3320.000000 3320.000000 3320.000000 ... 3320.000000
mean 1.426054 1.696988 2.260843 ... 2.068373
std 0.715703 1.153635 1.740650 ... 1.865101
min 0.000000 0.000000 1.000000 ... 0.000000
25% 1.000000 1.000000 1.000000 ... 1.000000
50% 1.000000 1.000000 2.000000 ... 1.000000
75% 2.000000 2.000000 3.000000 ... 2.000000
max 7.000000 10.000000 16.000000 ... 16.000000
extra_people minimum_nights availability_30 availability_60 \
count 3320.000000 3320.000000 3320.000000 3320.000000
mean 14.991566 1.965361 16.756928 37.440964
std 28.168219 2.688678 10.891157 20.918877
min 0.000000 1.000000 0.000000 0.000000
25% 0.000000 1.000000 6.750000 21.000000
50% 0.000000 2.000000 19.000000 44.000000
75% 25.000000 2.000000 28.000000 57.000000
max 500.000000 60.000000 30.000000 60.000000
availability_90 availability_365 number_of_reviews \
count 3320.000000 3320.000000 3320.000000
mean 59.978916 273.629819 18.436446
std 30.595223 114.416397 31.446002
min 0.000000 0.000000 1.000000
25% 39.000000 218.750000 3.000000
50% 71.000000 332.000000 7.000000
75% 87.000000 359.000000 20.000000
max 90.000000 365.000000 314.000000
review_scores_rating cancellation_policy
count 3320.000000 3320.000000
mean 96.025301 1.739759
std 5.117595 1.295826
min 80.000000 0.000000
25% 94.000000 1.000000
50% 98.000000 1.000000
75% 100.000000 3.000000
max 100.000000 4.000000
[8 rows x 25 columns]
<ipython-input-974-bbed5e760c05>:5: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning. cormat = df_prob_3.corr()
In [ ]:
# Select only numerical columns
numerical_columns = df_prob_3.select_dtypes(include=[np.number])
# Display the numerical variables
numerical_columns.head()
Out[ ]:
| host_since | host_response_time | host_response_rate | host_listings_count | property_type | room_type | accommodates | bathrooms | bedrooms | beds | ... | guests_included | extra_people | minimum_nights | availability_30 | availability_60 | availability_90 | availability_365 | number_of_reviews | review_scores_rating | cancellation_policy | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 7 | 2014 | 3 | 100.0 | 1.0 | 9 | 1 | 2 | 1.0 | 1.0 | 1.0 | ... | 2 | 10.0 | 1 | 16 | 16 | 37 | 312 | 2 | 100.0 | 1 |
| 9 | 2012 | 3 | 100.0 | 1.0 | 9 | 1 | 2 | 1.0 | 1.0 | 1.0 | ... | 2 | 19.0 | 1 | 29 | 59 | 89 | 364 | 20 | 99.0 | 1 |
| 11 | 2011 | 3 | 100.0 | 1.0 | 9 | 1 | 2 | 1.0 | 1.0 | 1.0 | ... | 1 | 10.0 | 3 | 24 | 54 | 84 | 84 | 9 | 93.0 | 3 |
| 12 | 2013 | 3 | 97.0 | 9.0 | 9 | 1 | 2 | 1.0 | 1.0 | 1.0 | ... | 1 | 0.0 | 2 | 25 | 55 | 85 | 360 | 4 | 100.0 | 3 |
| 13 | 2013 | 3 | 97.0 | 9.0 | 9 | 1 | 2 | 1.0 | 1.0 | 1.0 | ... | 1 | 20.0 | 3 | 29 | 59 | 89 | 364 | 7 | 100.0 | 3 |
5 rows × 25 columns
Initial Model
In [ ]:
# Perform linear regression
numerical_columns = df_prob_3.select_dtypes(include=[np.number])
X = numerical_columns.drop(columns=['review_scores_rating'])
y = numerical_columns['review_scores_rating']
X = sm.add_constant(X)
reg1_prob_3 = sm.OLS(y, X).fit()
# Display regression summary
print(reg1_prob_3.summary())
OLS Regression Results
================================================================================
Dep. Variable: review_scores_rating R-squared: 0.052
Model: OLS Adj. R-squared: 0.045
Method: Least Squares F-statistic: 7.571
Date: Wed, 13 Dec 2023 Prob (F-statistic): 2.24e-25
Time: 22:46:26 Log-Likelihood: -10042.
No. Observations: 3320 AIC: 2.013e+04
Df Residuals: 3295 BIC: 2.029e+04
Df Model: 24
Covariance Type: nonrobust
=======================================================================================
coef std err t P>|t| [0.025 0.975]
---------------------------------------------------------------------------------------
const 73.3358 129.518 0.566 0.571 -180.609 327.280
host_since 0.0101 0.064 0.157 0.875 -0.116 0.136
host_response_time -0.0911 0.133 -0.685 0.493 -0.352 0.170
host_response_rate 0.0326 0.008 3.889 0.000 0.016 0.049
host_listings_count -0.0133 0.002 -8.416 0.000 -0.016 -0.010
property_type 0.0979 0.022 4.351 0.000 0.054 0.142
room_type -0.7683 0.221 -3.478 0.001 -1.201 -0.335
accommodates -0.1693 0.067 -2.521 0.012 -0.301 -0.038
bathrooms 0.3326 0.204 1.634 0.102 -0.066 0.732
bedrooms -0.0307 0.156 -0.197 0.844 -0.336 0.274
beds -0.0482 0.096 -0.500 0.617 -0.237 0.141
bed_type -0.1196 0.169 -0.710 0.478 -0.450 0.211
price -0.0009 0.002 -0.446 0.656 -0.005 0.003
weekly_price 0.0003 0.000 0.970 0.332 -0.000 0.001
security_deposit 0.0003 0.000 1.029 0.304 -0.000 0.001
cleaning_fee 0.0044 0.002 1.827 0.068 -0.000 0.009
guests_included 0.0786 0.063 1.248 0.212 -0.045 0.202
extra_people -0.0001 0.004 -0.032 0.974 -0.007 0.007
minimum_nights 0.0247 0.033 0.749 0.454 -0.040 0.089
availability_30 0.0307 0.024 1.280 0.201 -0.016 0.078
availability_60 -0.0409 0.022 -1.822 0.069 -0.085 0.003
availability_90 0.0129 0.012 1.112 0.266 -0.010 0.036
availability_365 -0.0004 0.001 -0.349 0.727 -0.002 0.002
number_of_reviews 5.566e-07 0.003 0.000 1.000 -0.006 0.006
cancellation_policy -0.1007 0.077 -1.308 0.191 -0.252 0.050
==============================================================================
Omnibus: 910.589 Durbin-Watson: 1.981
Prob(Omnibus): 0.000 Jarque-Bera (JB): 2064.947
Skew: -1.556 Prob(JB): 0.00
Kurtosis: 5.290 Cond. No. 4.02e+06
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 4.02e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
In [ ]:
vifres = pd.DataFrame()
vifres["Variable"] = X.columns
vifres["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
print(vifres)
Variable VIF 0 const 2.227561e+06 1 host_since 1.181043e+00 2 host_response_time 1.501849e+00 3 host_response_rate 1.486876e+00 4 host_listings_count 1.366618e+00 5 property_type 1.204378e+00 6 room_type 1.632806e+00 7 accommodates 4.735134e+00 8 bathrooms 2.816620e+00 9 bedrooms 4.274533e+00 10 beds 3.740522e+00 11 bed_type 1.113456e+00 12 price 2.972996e+01 13 weekly_price 2.878216e+01 14 security_deposit 1.575646e+00 15 cleaning_fee 2.371857e+00 16 guests_included 1.829933e+00 17 extra_people 1.333160e+00 18 minimum_nights 1.043525e+00 19 availability_30 9.065570e+00 20 availability_60 2.921048e+01 21 availability_90 1.671801e+01 22 availability_365 1.855702e+00 23 number_of_reviews 1.182531e+00 24 cancellation_policy 1.320730e+00
In [ ]:
def stepwise_selection(X, y,
initial_list=[],
threshold_in=0.01,
threshold_out = 0.05,
verbose=True):
included = list(initial_list)
while True:
changed=False
# forward step
excluded = list(set(X.columns)-set(included))
new_pval = pd.Series(index=excluded)
for new_column in excluded:
model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included+[new_column]]))).fit()
new_pval[new_column] = model.pvalues[new_column]
best_pval = new_pval.min()
if best_pval < threshold_in:
best_feature = new_pval.idxmin()
included.append(best_feature)
changed=True
if verbose:
print(f'Add {best_feature} with p-value {best_pval:.4f}')
# backward step
model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
# use all coefs except intercept
pvalues = model.pvalues.iloc[1:]
worst_pval = pvalues.max() # null if pvalues is empty
if worst_pval > threshold_out:
changed=True
worst_feature = pvalues.idxmax()
included.remove(worst_feature)
if verbose:
print(f'Drop {worst_feature} with p-value {worst_pval:.4f}')
if not changed:
break
return included
In [ ]:
selected_features = stepwise_selection(X, y)
print('resulting features:')
print(selected_features)
Add const with p-value 0.0000
<ipython-input-978-27ab6f6e2725>:12: FutureWarning: The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning. new_pval = pd.Series(index=excluded) <ipython-input-978-27ab6f6e2725>:12: FutureWarning: The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning. new_pval = pd.Series(index=excluded)
Add host_listings_count with p-value 0.0000 Add cleaning_fee with p-value 0.0000
<ipython-input-978-27ab6f6e2725>:12: FutureWarning: The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning. new_pval = pd.Series(index=excluded) <ipython-input-978-27ab6f6e2725>:12: FutureWarning: The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning. new_pval = pd.Series(index=excluded)
Add host_response_rate with p-value 0.0000 Add property_type with p-value 0.0001
<ipython-input-978-27ab6f6e2725>:12: FutureWarning: The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning. new_pval = pd.Series(index=excluded) <ipython-input-978-27ab6f6e2725>:12: FutureWarning: The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning. new_pval = pd.Series(index=excluded)
Add room_type with p-value 0.0026 resulting features: ['const', 'host_listings_count', 'cleaning_fee', 'host_response_rate', 'property_type', 'room_type']
<ipython-input-978-27ab6f6e2725>:12: FutureWarning: The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning. new_pval = pd.Series(index=excluded)
In [ ]:
# Fit stepwise regression
X = numerical_columns[['host_listings_count', 'cleaning_fee', 'host_response_rate', 'property_type', 'room_type']]
X = sm.add_constant(X)
stepreg_prob_3 = sm.OLS(y, X).fit()
# Display regression summary
print(stepreg_prob_3.summary())
OLS Regression Results
================================================================================
Dep. Variable: review_scores_rating R-squared: 0.043
Model: OLS Adj. R-squared: 0.041
Method: Least Squares F-statistic: 29.58
Date: Wed, 13 Dec 2023 Prob (F-statistic): 1.73e-29
Time: 22:46:28 Log-Likelihood: -10058.
No. Observations: 3320 AIC: 2.013e+04
Df Residuals: 3314 BIC: 2.017e+04
Df Model: 5
Covariance Type: nonrobust
=======================================================================================
coef std err t P>|t| [0.025 0.975]
---------------------------------------------------------------------------------------
const 92.6527 0.668 138.745 0.000 91.343 93.962
host_listings_count -0.0144 0.001 -10.017 0.000 -0.017 -0.012
cleaning_fee 0.0050 0.002 2.687 0.007 0.001 0.009
host_response_rate 0.0313 0.007 4.515 0.000 0.018 0.045
property_type 0.0896 0.021 4.240 0.000 0.048 0.131
room_type -0.5831 0.194 -3.010 0.003 -0.963 -0.203
==============================================================================
Omnibus: 888.450 Durbin-Watson: 1.981
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1957.894
Skew: -1.535 Prob(JB): 0.00
Kurtosis: 5.173 Cond. No. 879.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [ ]:
vifresstep = pd.DataFrame()
vifresstep["Variable"] = X.columns
vifresstep["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
print(vifresstep)
Variable VIF 0 const 58.965066 1 host_listings_count 1.119284 2 cleaning_fee 1.415161 3 host_response_rate 1.012429 4 property_type 1.059833 5 room_type 1.250513
In [ ]:
# Calculate residuals
residuals = stepreg_prob_3.resid
# Generate Q-Q Plot
fig = sm.qqplot(residuals,fit=True, line='45')
plt.show()
# Residuals vs. Fitted
plt.figure(figsize=(10, 6))
plt.scatter(stepreg_prob_3.fittedvalues, residuals)
plt.xlabel('Fitted Values')
plt.ylabel('Residuals')
plt.title('Residuals vs. Fitted Values')
plt.axhline(y=0, color='r', linestyle='--') # Add a horizontal line at y=0
plt.grid(True)
plt.show()
In [ ]:
# log transformed regression
X = numerical_columns[['host_listings_count', 'cleaning_fee', 'host_response_rate', 'property_type', 'room_type']]
X = sm.add_constant(X)
logy = np.log(y)
logreg_prob_3 = sm.OLS(logy, X).fit()
# Display regression summary
print(logreg_prob_3.summary())
OLS Regression Results
================================================================================
Dep. Variable: review_scores_rating R-squared: 0.043
Model: OLS Adj. R-squared: 0.041
Method: Least Squares F-statistic: 29.48
Date: Wed, 13 Dec 2023 Prob (F-statistic): 2.22e-29
Time: 22:46:29 Log-Likelihood: 4936.7
No. Observations: 3320 AIC: -9861.
Df Residuals: 3314 BIC: -9825.
Df Model: 5
Covariance Type: nonrobust
=======================================================================================
coef std err t P>|t| [0.025 0.975]
---------------------------------------------------------------------------------------
const 4.5264 0.007 620.361 0.000 4.512 4.541
host_listings_count -0.0002 1.57e-05 -10.002 0.000 -0.000 -0.000
cleaning_fee 5.229e-05 2.05e-05 2.555 0.011 1.22e-05 9.24e-05
host_response_rate 0.0003 7.57e-05 4.515 0.000 0.000 0.000
property_type 0.0010 0.000 4.258 0.000 0.001 0.001
room_type -0.0065 0.002 -3.047 0.002 -0.011 -0.002
==============================================================================
Omnibus: 1012.580 Durbin-Watson: 1.982
Prob(Omnibus): 0.000 Jarque-Bera (JB): 2560.565
Skew: -1.670 Prob(JB): 0.00
Kurtosis: 5.711 Cond. No. 879.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [ ]:
# Calculate residuals
residuals = logreg_prob_3.resid
# Generate Q-Q Plot
fig = sm.qqplot(residuals, fit=True, line='45')
plt.show()
# Residuals vs. Fitted
plt.figure(figsize=(10, 6))
plt.scatter(logreg_prob_3.fittedvalues, residuals)
plt.xlabel('Fitted Values')
plt.ylabel('Residuals')
plt.title('Residuals vs. Fitted Values')
plt.axhline(y=0, color='r', linestyle='--') # Add a horizontal line at y=0
plt.grid(True)
plt.show()
In [ ]:
# square root transformed regression
X = numerical_columns[['host_listings_count', 'cleaning_fee', 'host_response_rate', 'property_type', 'room_type']]
X = sm.add_constant(X)
sqrty = np.sqrt(y)
sqrtreg_prob_3 = sm.OLS(logy, X).fit()
# Display regression summary
print(sqrtreg_prob_3.summary())
OLS Regression Results
================================================================================
Dep. Variable: review_scores_rating R-squared: 0.043
Model: OLS Adj. R-squared: 0.041
Method: Least Squares F-statistic: 29.48
Date: Wed, 13 Dec 2023 Prob (F-statistic): 2.22e-29
Time: 22:46:29 Log-Likelihood: 4936.7
No. Observations: 3320 AIC: -9861.
Df Residuals: 3314 BIC: -9825.
Df Model: 5
Covariance Type: nonrobust
=======================================================================================
coef std err t P>|t| [0.025 0.975]
---------------------------------------------------------------------------------------
const 4.5264 0.007 620.361 0.000 4.512 4.541
host_listings_count -0.0002 1.57e-05 -10.002 0.000 -0.000 -0.000
cleaning_fee 5.229e-05 2.05e-05 2.555 0.011 1.22e-05 9.24e-05
host_response_rate 0.0003 7.57e-05 4.515 0.000 0.000 0.000
property_type 0.0010 0.000 4.258 0.000 0.001 0.001
room_type -0.0065 0.002 -3.047 0.002 -0.011 -0.002
==============================================================================
Omnibus: 1012.580 Durbin-Watson: 1.982
Prob(Omnibus): 0.000 Jarque-Bera (JB): 2560.565
Skew: -1.670 Prob(JB): 0.00
Kurtosis: 5.711 Cond. No. 879.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [ ]:
# Calculate residuals
residuals = sqrtreg_prob_3.resid
# Generate Q-Q Plot
fig = sm.qqplot(residuals, fit=True, line='45')
plt.show()
# Residuals vs. Fitted
plt.figure(figsize=(10, 6))
plt.scatter(sqrtreg_prob_3.fittedvalues, residuals)
plt.xlabel('Fitted Values')
plt.ylabel('Residuals')
plt.title('Residuals vs. Fitted Values')
plt.axhline(y=0, color='r', linestyle='--') # Add a horizontal line at y=0
plt.grid(True)
plt.show()
Classifiers ("5" Catgories)¶
In [ ]:
def categorize_review_scores(score):
if score < 20:
return 0
elif score >= 20 and score < 40:
return 1
elif score >= 40 and score < 60:
return 2
elif score >= 60 and score < 80:
return 3
else:
return 4
In [ ]:
df_prob_3_2 = df.drop(['review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value'], axis=1)
df_prob_3_2['review_scores_rating'] = df_prob_3_2['review_scores_rating'].apply(lambda x: categorize_review_scores(x)).astype('category')
df_prob_3_2['review_scores_rating'].head(60)
Out[ ]:
7 4 9 4 11 4 12 4 13 4 14 4 16 4 19 4 23 4 26 4 27 4 28 4 29 3 34 4 35 4 37 4 39 4 40 4 44 4 46 4 50 4 51 4 53 4 56 4 57 4 58 4 60 4 62 4 63 4 65 4 66 4 67 4 68 4 71 4 72 4 75 4 76 4 77 4 83 4 85 4 86 4 87 4 88 4 89 4 93 4 94 4 96 4 100 4 101 4 102 4 103 4 104 4 107 4 108 4 110 4 111 4 113 4 114 4 115 4 116 4 Name: review_scores_rating, dtype: category Categories (4, int64): [1, 2, 3, 4]
In [ ]:
#No reviews are below 20
df_prob_3_2[df_prob_3_2['review_scores_rating'] == 0]
Out[ ]:
| host_since | host_location | host_response_time | host_response_rate | host_is_superhost | host_listings_count | host_has_profile_pic | host_identity_verified | property_type | room_type | ... | extra_people | minimum_nights | availability_30 | availability_60 | availability_90 | availability_365 | number_of_reviews | review_scores_rating | instant_bookable | cancellation_policy |
|---|
0 rows × 30 columns
In [ ]:
df_prob_3_2.groupby('review_scores_rating').count()
Out[ ]:
| host_since | host_location | host_response_time | host_response_rate | host_is_superhost | host_listings_count | host_has_profile_pic | host_identity_verified | property_type | room_type | ... | guests_included | extra_people | minimum_nights | availability_30 | availability_60 | availability_90 | availability_365 | number_of_reviews | instant_bookable | cancellation_policy | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| review_scores_rating | |||||||||||||||||||||
| 1 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | ... | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 |
| 2 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | ... | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 |
| 3 | 43 | 43 | 43 | 43 | 43 | 43 | 43 | 43 | 43 | 43 | ... | 43 | 43 | 43 | 43 | 43 | 43 | 43 | 43 | 43 | 43 |
| 4 | 3320 | 3320 | 3320 | 3320 | 3320 | 3320 | 3320 | 3320 | 3320 | 3320 | ... | 3320 | 3320 | 3320 | 3320 | 3320 | 3320 | 3320 | 3320 | 3320 | 3320 |
4 rows × 29 columns
In [ ]:
df_prob_3_sampled = df_prob_3_2.groupby('review_scores_rating').apply(lambda s: s.sample(min(len(s), 30)))
In [ ]:
#Split the data on what the model is learning to predict, whether an AirBnB will be booked
X = df_prob_3_sampled.drop('review_scores_rating', axis=1)
y = df_prob_3_sampled['review_scores_rating']
#Split the data into training and test sets to be able to train and compare models
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=77)
X_train.info()
y_train.info()
<class 'pandas.core.frame.DataFrame'> MultiIndex: 48 entries, (2, 2696) to (3, 3665) Data columns (total 29 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 host_since 48 non-null int64 1 host_location 48 non-null category 2 host_response_time 48 non-null int64 3 host_response_rate 48 non-null float64 4 host_is_superhost 48 non-null category 5 host_listings_count 48 non-null float64 6 host_has_profile_pic 48 non-null category 7 host_identity_verified 48 non-null category 8 property_type 48 non-null int64 9 room_type 48 non-null int64 10 accommodates 48 non-null int64 11 bathrooms 48 non-null float64 12 bedrooms 48 non-null float64 13 beds 48 non-null float64 14 bed_type 48 non-null int64 15 price 48 non-null float64 16 weekly_price 48 non-null float64 17 security_deposit 48 non-null float64 18 cleaning_fee 48 non-null float64 19 guests_included 48 non-null int64 20 extra_people 48 non-null float64 21 minimum_nights 48 non-null int64 22 availability_30 48 non-null int64 23 availability_60 48 non-null int64 24 availability_90 48 non-null int64 25 availability_365 48 non-null int64 26 number_of_reviews 48 non-null int64 27 instant_bookable 48 non-null category 28 cancellation_policy 48 non-null int64 dtypes: category(5), float64(10), int64(14) memory usage: 12.9 KB <class 'pandas.core.series.Series'> MultiIndex: 48 entries, (2, 2696) to (3, 3665) Series name: review_scores_rating Non-Null Count Dtype -------------- ----- 48 non-null category dtypes: category(1) memory usage: 3.3 KB
Create And Assess Decision Tree Classifiers¶
Default Tree¶
In [ ]:
# create an instance of a decision tree classifier using default values
dt_prob_3 = DecisionTreeClassifier(max_depth = 25, min_samples_leaf=10, ccp_alpha = 0.001)
# fit the model to the training data
dt_prob_3.fit(X_train, y_train)
Out[ ]:
DecisionTreeClassifier(ccp_alpha=0.001, max_depth=25, min_samples_leaf=10)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier(ccp_alpha=0.001, max_depth=25, min_samples_leaf=10)
In [ ]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
# visualize the decision tree
dot_data = export_graphviz(dt_prob_3, filled=True, rounded=True, feature_names=X.columns, class_names=['1','2','3','4'])
graph = graphviz.Source(dot_data)
graph.render("decision_tree_default_prob_3")
Out[ ]:
'decision_tree_default_prob_3.pdf'
In [ ]:
#Show the visualization of the decision tree in this notebook
export_graphviz(dt_prob_3,
'tree.dot',
class_names=['1','2','3','4'],
feature_names = X_train.columns)
! dot -Tpng tree.dot -o tree.png
import matplotlib.pyplot as plt
import cv2
%matplotlib inline
img = cv2.imread('tree.png')
plt.figure(figsize = (20, 40))
plt.imshow(img)
Out[ ]:
<matplotlib.image.AxesImage at 0x7c57c4041d50>
In [ ]:
# make predictions on the training and test data
y_pred_train = dt_prob_3.predict(X_train)
y_pred_test = dt_prob_3.predict(X_test)
y_prob_train = dt_prob_3.predict_proba(X_train)
y_prob_test = dt_prob_3.predict_proba(X_test)
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train)
prec_train = precision_score(y_train, y_pred_train, average=None)
rec_train = recall_score(y_train, y_pred_train, average=None)
# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print(f"Precision: {list(map('{:.4f}'.format,prec_train))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_train))}")
print("")
# calculate the accuracy, precision, and recall scores for the test set
acc_test = accuracy_score(y_test, y_pred_test)
prec_test = precision_score(y_test, y_pred_test, average=None)
rec_test = recall_score(y_test, y_pred_test, average=None)
# print the scores for the test set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print(f"Precision: {list(map('{:.4f}'.format,prec_test))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_test))}")
-- train set -- Accuracy : 0.6458 Precision: ['0.0000', '0.0000', '0.6061', '0.7333'] Recall: ['0.0000', '0.0000', '0.8333', '0.6471'] -- test set -- Accuracy : 0.5238 Precision: ['0.0000', '0.0000', '0.3000', '0.7273'] Recall: ['0.0000', '0.0000', '0.5000', '0.6154']
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
In [ ]:
# confusion matrix for training set
conf_matrix = confusion_matrix(y_pred_train, y_train)
print(conf_matrix)
# Generate confusion matrix for test set
conf_matrix = confusion_matrix(y_pred_test, y_test)
print(conf_matrix)
[[ 0 0 0 0] [ 0 0 0 0] [ 4 3 20 6] [ 0 0 4 11]] [[0 0 0 0] [0 0 0 0] [1 1 3 5] [0 0 3 8]]
In [ ]:
#Binarize by one-hot encoding to be able to generate ROC curve
label_binarizer = LabelBinarizer().fit(y_train)
y_onehot_test = label_binarizer.transform(y_test)
y_onehot_test.shape # (n_samples, n_classes)
Out[ ]:
(21, 4)
In [ ]:
n_samples, n_features = X.shape
n_classes = len(np.unique(y))
In [ ]:
n_names = ['1', '2', '3', '4']
In [ ]:
# store the fpr, tpr, and roc_auc for all averaging strategies
fpr, tpr, roc_auc = dict(), dict(), dict()
# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_onehot_test.ravel(), y_prob_test.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
print(f"Micro-averaged One-vs-Rest ROC AUC score:\n{roc_auc['micro']:.2f}")
Micro-averaged One-vs-Rest ROC AUC score: 0.80
In [ ]:
for i in range(n_classes):
fpr[i], tpr[i], _ = roc_curve(y_onehot_test[:, i], y_prob_test[:, i])
roc_auc[i] = auc(fpr[i], tpr[i])
fpr_grid = np.linspace(0.0, 1.0, 1000)
# Interpolate all ROC curves at these points
mean_tpr = np.zeros_like(fpr_grid)
for i in range(n_classes):
mean_tpr += np.interp(fpr_grid, fpr[i], tpr[i]) # linear interpolation
# Average it and compute AUC
mean_tpr /= n_classes
fpr["macro"] = fpr_grid
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
print(f"Macro-averaged One-vs-Rest ROC AUC score:\n{roc_auc['macro']:.2f}")
Macro-averaged One-vs-Rest ROC AUC score: 0.70
In [ ]:
fig, ax = plt.subplots(figsize=(6, 6))
plt.plot(
fpr["micro"],
tpr["micro"],
label=f"micro-average ROC curve (AUC = {roc_auc['micro']:.2f})",
color="deeppink",
linestyle=":",
linewidth=4,
)
plt.plot(
fpr["macro"],
tpr["macro"],
label=f"macro-average ROC curve (AUC = {roc_auc['macro']:.2f})",
color="navy",
linestyle=":",
linewidth=4,
)
colors = cycle(["aqua", "darkorange", "cornflowerblue", "lightgreen"])
for class_id, color in zip(range(n_classes), colors):
RocCurveDisplay.from_predictions(
y_onehot_test[:, class_id],
y_prob_test[:, class_id],
name=f"ROC curve for {n_names[class_id]}",
color=color,
ax=ax,
#plot_chance_level=(class_id == 2),
)
plt.axis("square")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Extension of Receiver Operating Characteristic\nto One-vs-Rest multiclass")
plt.legend()
plt.show()
In [ ]:
#calculate feature importance
tree_imp = dt_prob_3.feature_importances_
#create a data frame with feature names
# creating a list of column names
column_values = ['importance']
# creating the dataframe
df_tree = pd.DataFrame(data = tree_imp,
columns = column_values)
df_tree['feature']=X.columns
#sort data so features with largest importance values are at the top
df_tree2 = df_tree.sort_values(by=['importance'], ascending=False)
print(df_tree2)
#Create variable importance plot
plt.figure(figsize=(5, 10))
plt.title('Variable Importance')
plt.xlabel('Importance')
plt.ylabel('Feature')
sns.barplot(data=df_tree2,
y=df_tree2['feature'],
x=df_tree2['importance'], color="lightblue")
plt.show()
importance feature 26 0.808137 number_of_reviews 23 0.191863 availability_60 0 0.000000 host_since 15 0.000000 price 27 0.000000 instant_bookable 25 0.000000 availability_365 24 0.000000 availability_90 22 0.000000 availability_30 21 0.000000 minimum_nights 20 0.000000 extra_people 19 0.000000 guests_included 18 0.000000 cleaning_fee 17 0.000000 security_deposit 16 0.000000 weekly_price 14 0.000000 bed_type 1 0.000000 host_location 13 0.000000 beds 12 0.000000 bedrooms 11 0.000000 bathrooms 10 0.000000 accommodates 9 0.000000 room_type 8 0.000000 property_type 7 0.000000 host_identity_verified 6 0.000000 host_has_profile_pic 5 0.000000 host_listings_count 4 0.000000 host_is_superhost 3 0.000000 host_response_rate 2 0.000000 host_response_time 28 0.000000 cancellation_policy
Tuned Tree¶
In [ ]:
#Use a grid search with a decision tree to determine which parameters obatin the
#best scores on the training set so we have "tuned" parameters or values
dt_tune_prob_3 = DecisionTreeClassifier()
param_grid = {
'max_depth': [None, 5, 10, 15, 20, 25],
'min_samples_leaf': [1, 10, 20, 50, 100],
'ccp_alpha': [0, 0.001, 0.01, 0.1, 0.5, 1]
}
grid_search = GridSearchCV(dt_tune_prob_3, param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_
print(best_params)
print(best_estimator)
/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_split.py:700: UserWarning: The least populated class in y has only 3 members, which is less than n_splits=5. warnings.warn(
{'ccp_alpha': 0, 'max_depth': None, 'min_samples_leaf': 10}
DecisionTreeClassifier(ccp_alpha=0, min_samples_leaf=10)
In [ ]:
# create an instance of a decision tree classifier using "tuned" values
dt_tuned_prob_3 = DecisionTreeClassifier(max_depth = None, min_samples_leaf=10, ccp_alpha = 0)
# fit the model to the training data
dt_tuned_prob_3.fit(X_train, y_train)
Out[ ]:
DecisionTreeClassifier(ccp_alpha=0, min_samples_leaf=10)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier(ccp_alpha=0, min_samples_leaf=10)
In [ ]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
# visualize the decision tree
dot_data = export_graphviz(dt_tuned_prob_3, filled=True, rounded=True, feature_names=X.columns, class_names=['1', '2', '3', '4'])
graph = graphviz.Source(dot_data)
graph.render("decision_tree_tuned_prob_3")
Out[ ]:
'decision_tree_tuned_prob_3.pdf'
In [ ]:
#Show the visualization of the decision tree in this notebook
export_graphviz(dt_tuned_prob_3,
'tree.dot',
class_names=['1', '2', '3', '4'],
feature_names = X_train.columns)
! dot -Tpng tree.dot -o tree.png
import matplotlib.pyplot as plt
import cv2
%matplotlib inline
img = cv2.imread('tree.png')
plt.figure(figsize = (20, 40))
plt.imshow(img)
Out[ ]:
<matplotlib.image.AxesImage at 0x7c57c3a1fe80>
In [ ]:
# make predictions on the training and test data
y_pred_train = dt_tuned_prob_3.predict(X_train)
y_pred_test = dt_tuned_prob_3.predict(X_test)
y_prob_train = dt_tuned_prob_3.predict_proba(X_train)
y_prob_test = dt_tuned_prob_3.predict_proba(X_test)
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train)
prec_train = precision_score(y_train, y_pred_train, average=None)
rec_train = recall_score(y_train, y_pred_train, average=None)
# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print(f"Precision: {list(map('{:.4f}'.format,prec_train))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_train))}")
print("")
# calculate the accuracy, precision, and recall scores for the test set
acc_test = accuracy_score(y_test, y_pred_test)
prec_test = precision_score(y_test, y_pred_test, average=None)
rec_test = recall_score(y_test, y_pred_test, average=None)
# print the scores for the test set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print(f"Precision: {list(map('{:.4f}'.format,prec_test))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_test))}")
-- train set -- Accuracy : 0.6458 Precision: ['0.0000', '0.0000', '0.6061', '0.7333'] Recall: ['0.0000', '0.0000', '0.8333', '0.6471'] -- test set -- Accuracy : 0.5238 Precision: ['0.0000', '0.0000', '0.3000', '0.7273'] Recall: ['0.0000', '0.0000', '0.5000', '0.6154']
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
In [ ]:
# confusion matrix for training set
conf_matrix = confusion_matrix(y_pred_train, y_train)
print(conf_matrix)
# Generate confusion matrix for test set
conf_matrix = confusion_matrix(y_pred_test, y_test)
print(conf_matrix)
[[ 0 0 0 0] [ 0 0 0 0] [ 4 3 20 6] [ 0 0 4 11]] [[0 0 0 0] [0 0 0 0] [1 1 3 5] [0 0 3 8]]
In [ ]:
#Binarize by one-hot encoding to be able to generate ROC curve
label_binarizer = LabelBinarizer().fit(y_train)
y_onehot_test = label_binarizer.transform(y_test)
y_onehot_test.shape # (n_samples, n_classes)
Out[ ]:
(21, 4)
In [ ]:
n_samples, n_features = X.shape
n_classes = len(np.unique(y))
In [ ]:
n_names = ['1', '2', '3', '4']
In [ ]:
# store the fpr, tpr, and roc_auc for all averaging strategies
fpr, tpr, roc_auc = dict(), dict(), dict()
# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_onehot_test.ravel(), y_prob_test.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
print(f"Micro-averaged One-vs-Rest ROC AUC score:\n{roc_auc['micro']:.2f}")
Micro-averaged One-vs-Rest ROC AUC score: 0.80
In [ ]:
for i in range(n_classes):
fpr[i], tpr[i], _ = roc_curve(y_onehot_test[:, i], y_prob_test[:, i])
roc_auc[i] = auc(fpr[i], tpr[i])
fpr_grid = np.linspace(0.0, 1.0, 1000)
# Interpolate all ROC curves at these points
mean_tpr = np.zeros_like(fpr_grid)
for i in range(n_classes):
mean_tpr += np.interp(fpr_grid, fpr[i], tpr[i]) # linear interpolation
# Average it and compute AUC
mean_tpr /= n_classes
fpr["macro"] = fpr_grid
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
print(f"Macro-averaged One-vs-Rest ROC AUC score:\n{roc_auc['macro']:.2f}")
Macro-averaged One-vs-Rest ROC AUC score: 0.70
In [ ]:
fig, ax = plt.subplots(figsize=(6, 6))
plt.plot(
fpr["micro"],
tpr["micro"],
label=f"micro-average ROC curve (AUC = {roc_auc['micro']:.2f})",
color="deeppink",
linestyle=":",
linewidth=4,
)
plt.plot(
fpr["macro"],
tpr["macro"],
label=f"macro-average ROC curve (AUC = {roc_auc['macro']:.2f})",
color="navy",
linestyle=":",
linewidth=4,
)
colors = cycle(["aqua", "darkorange", "cornflowerblue", "lightgreen"])
for class_id, color in zip(range(n_classes), colors):
RocCurveDisplay.from_predictions(
y_onehot_test[:, class_id],
y_prob_test[:, class_id],
name=f"ROC curve for {n_names[class_id]}",
color=color,
ax=ax,
#plot_chance_level=(class_id == 2),
)
plt.axis("square")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Extension of Receiver Operating Characteristic\nto One-vs-Rest multiclass")
plt.legend()
plt.show()
In [ ]:
#calculate feature importance
tree_imp = dt_tuned_prob_3.feature_importances_
#create a data frame with feature names
# creating a list of column names
column_values = ['importance']
# creating the dataframe
df_tree = pd.DataFrame(data = tree_imp,
columns = column_values)
df_tree['feature']=X.columns
#sort data so features with largest importance values are at the top
df_tree2 = df_tree.sort_values(by=['importance'], ascending=False)
print(df_tree2)
#Create variable importance plot
plt.figure(figsize=(5, 10))
plt.title('Variable Importance')
plt.xlabel('Importance')
plt.ylabel('Feature')
sns.barplot(data=df_tree2,
y=df_tree2['feature'],
x=df_tree2['importance'], color="lightblue")
plt.show()
importance feature 26 0.808137 number_of_reviews 23 0.191863 availability_60 0 0.000000 host_since 15 0.000000 price 27 0.000000 instant_bookable 25 0.000000 availability_365 24 0.000000 availability_90 22 0.000000 availability_30 21 0.000000 minimum_nights 20 0.000000 extra_people 19 0.000000 guests_included 18 0.000000 cleaning_fee 17 0.000000 security_deposit 16 0.000000 weekly_price 14 0.000000 bed_type 1 0.000000 host_location 13 0.000000 beds 12 0.000000 bedrooms 11 0.000000 bathrooms 10 0.000000 accommodates 9 0.000000 room_type 8 0.000000 property_type 7 0.000000 host_identity_verified 6 0.000000 host_has_profile_pic 5 0.000000 host_listings_count 4 0.000000 host_is_superhost 3 0.000000 host_response_rate 2 0.000000 host_response_time 28 0.000000 cancellation_policy
Tree With No Min¶
In [ ]:
# create an instance of a decision tree classifier using default values
dt_prob_3_2 = DecisionTreeClassifier(max_depth = None, min_samples_leaf=1, ccp_alpha = 0)
# fit the model to the training data
dt_prob_3_2.fit(X_train, y_train)
Out[ ]:
DecisionTreeClassifier(ccp_alpha=0)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier(ccp_alpha=0)
In [ ]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
# visualize the decision tree
dot_data = export_graphviz(dt_prob_3_2, filled=True, rounded=True, feature_names=X.columns, class_names=['1', '2', '3', '4'])
graph = graphviz.Source(dot_data)
graph.render("decision_tree_no_min_prob_3")
Out[ ]:
'decision_tree_no_min_prob_3.pdf'
In [ ]:
#Show the visualization of the decision tree in this notebook
export_graphviz(dt_prob_3_2,
'tree.dot',
class_names=['1','2', '3', '4'],
feature_names = X_train.columns)
! dot -Tpng tree.dot -o tree.png
import matplotlib.pyplot as plt
import cv2
%matplotlib inline
img = cv2.imread('tree.png')
plt.figure(figsize = (20, 40))
plt.imshow(img)
Out[ ]:
<matplotlib.image.AxesImage at 0x7c57c288e0b0>
In [ ]:
# make predictions on the training and test data
y_pred_train = dt_prob_3_2.predict(X_train)
y_pred_test = dt_prob_3_2.predict(X_test)
y_prob_train = dt_prob_3_2.predict_proba(X_train)
y_prob_test = dt_prob_3_2.predict_proba(X_test)
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train)
prec_train = precision_score(y_train, y_pred_train, average=None)
rec_train = recall_score(y_train, y_pred_train, average=None)
# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print(f"Precision: {list(map('{:.4f}'.format,prec_train))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_train))}")
print("")
# calculate the accuracy, precision, and recall scores for the test set
acc_test = accuracy_score(y_test, y_pred_test)
prec_test = precision_score(y_test, y_pred_test, average=None)
rec_test = recall_score(y_test, y_pred_test, average=None)
# print the scores for the test set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print(f"Precision: {list(map('{:.4f}'.format,prec_test))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_test))}")
-- train set -- Accuracy : 1.0000 Precision: ['1.0000', '1.0000', '1.0000', '1.0000'] Recall: ['1.0000', '1.0000', '1.0000', '1.0000'] -- test set -- Accuracy : 0.4286 Precision: ['0.0000', '0.0000', '0.2000', '0.7000'] Recall: ['0.0000', '0.0000', '0.3333', '0.5385']
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
In [ ]:
# confusion matrix for training set
conf_matrix = confusion_matrix(y_pred_train, y_train)
print(conf_matrix)
# Generate confusion matrix for test set
conf_matrix = confusion_matrix(y_pred_test, y_test)
print(conf_matrix)
[[ 4 0 0 0] [ 0 3 0 0] [ 0 0 24 0] [ 0 0 0 17]] [[0 0 1 0] [0 0 0 0] [1 1 2 6] [0 0 3 7]]
In [ ]:
#Binarize by one-hot encoding to be able to generate ROC curve
label_binarizer = LabelBinarizer().fit(y_train)
y_onehot_test = label_binarizer.transform(y_test)
y_onehot_test.shape # (n_samples, n_classes)
Out[ ]:
(21, 4)
In [ ]:
n_samples, n_features = X.shape
n_classes = len(np.unique(y))
In [ ]:
n_names = ['1', '2', '3', '4']
In [ ]:
# store the fpr, tpr, and roc_auc for all averaging strategies
fpr, tpr, roc_auc = dict(), dict(), dict()
# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_onehot_test.ravel(), y_prob_test.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
print(f"Micro-averaged One-vs-Rest ROC AUC score:\n{roc_auc['micro']:.2f}")
Micro-averaged One-vs-Rest ROC AUC score: 0.62
In [ ]:
for i in range(n_classes):
fpr[i], tpr[i], _ = roc_curve(y_onehot_test[:, i], y_prob_test[:, i])
roc_auc[i] = auc(fpr[i], tpr[i])
fpr_grid = np.linspace(0.0, 1.0, 1000)
# Interpolate all ROC curves at these points
mean_tpr = np.zeros_like(fpr_grid)
for i in range(n_classes):
mean_tpr += np.interp(fpr_grid, fpr[i], tpr[i]) # linear interpolation
# Average it and compute AUC
mean_tpr /= n_classes
fpr["macro"] = fpr_grid
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
print(f"Macro-averaged One-vs-Rest ROC AUC score:\n{roc_auc['macro']:.2f}")
Macro-averaged One-vs-Rest ROC AUC score: 0.49
In [ ]:
fig, ax = plt.subplots(figsize=(6, 6))
plt.plot(
fpr["micro"],
tpr["micro"],
label=f"micro-average ROC curve (AUC = {roc_auc['micro']:.2f})",
color="deeppink",
linestyle=":",
linewidth=4,
)
plt.plot(
fpr["macro"],
tpr["macro"],
label=f"macro-average ROC curve (AUC = {roc_auc['macro']:.2f})",
color="navy",
linestyle=":",
linewidth=4,
)
colors = cycle(["aqua", "darkorange", "cornflowerblue", "lightgreen"])
for class_id, color in zip(range(n_classes), colors):
RocCurveDisplay.from_predictions(
y_onehot_test[:, class_id],
y_prob_test[:, class_id],
name=f"ROC curve for {n_names[class_id]}",
color=color,
ax=ax,
#plot_chance_level=(class_id == 2),
)
plt.axis("square")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Extension of Receiver Operating Characteristic\nto One-vs-Rest multiclass")
plt.legend()
plt.show()
In [ ]:
#calculate feature importance
tree_imp = dt_prob_3_2.feature_importances_
#create a data frame with feature names
# creating a list of column names
column_values = ['importance']
# creating the dataframe
df_tree = pd.DataFrame(data = tree_imp,
columns = column_values)
df_tree['feature']=X.columns
#sort data so features with largest importance values are at the top
df_tree2 = df_tree.sort_values(by=['importance'], ascending=False)
print(df_tree2)
#Create variable importance plot
plt.figure(figsize=(5, 10))
plt.title('Variable Importance')
plt.xlabel('Importance')
plt.ylabel('Feature')
sns.barplot(data=df_tree2,
y=df_tree2['feature'],
x=df_tree2['importance'], color="lightblue")
plt.show()
importance feature 26 0.209155 number_of_reviews 25 0.113154 availability_365 0 0.108305 host_since 5 0.086512 host_listings_count 11 0.062132 bathrooms 8 0.061720 property_type 7 0.061669 host_identity_verified 27 0.060280 instant_bookable 19 0.050919 guests_included 2 0.050368 host_response_time 18 0.045262 cleaning_fee 10 0.045262 accommodates 22 0.045262 availability_30 24 0.000000 availability_90 23 0.000000 availability_60 21 0.000000 minimum_nights 20 0.000000 extra_people 14 0.000000 bed_type 17 0.000000 security_deposit 16 0.000000 weekly_price 15 0.000000 price 1 0.000000 host_location 13 0.000000 beds 12 0.000000 bedrooms 9 0.000000 room_type 6 0.000000 host_has_profile_pic 4 0.000000 host_is_superhost 3 0.000000 host_response_rate 28 0.000000 cancellation_policy
Tree With No Min, Less Depth¶
In [ ]:
# create an instance of a decision tree classifier using default values
dt_prob_3_3 = DecisionTreeClassifier(max_depth = 10, min_samples_leaf=1, ccp_alpha = 0)
# fit the model to the training data
dt_prob_3_3.fit(X_train, y_train)
Out[ ]:
DecisionTreeClassifier(ccp_alpha=0, max_depth=10)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier(ccp_alpha=0, max_depth=10)
In [ ]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
# visualize the decision tree
dot_data = export_graphviz(dt_prob_3_3, filled=True, rounded=True, feature_names=X.columns, class_names=['1','2','3','4'])
graph = graphviz.Source(dot_data)
graph.render("decision_tree_no_min_less_depth_prob_3")
Out[ ]:
'decision_tree_no_min_less_depth_prob_3.pdf'
In [ ]:
#Show the visualization of the decision tree in this notebook
export_graphviz(dt_prob_3_3,
'tree.dot',
class_names=['1','2','3','4'],
feature_names = X_train.columns)
! dot -Tpng tree.dot -o tree.png
import matplotlib.pyplot as plt
import cv2
%matplotlib inline
img = cv2.imread('tree.png')
plt.figure(figsize = (20, 40))
plt.imshow(img)
Out[ ]:
<matplotlib.image.AxesImage at 0x7c57c3a1c3a0>
In [ ]:
# make predictions on the training and test data
y_pred_train = dt_prob_3_3.predict(X_train)
y_pred_test = dt_prob_3_3.predict(X_test)
y_prob_train = dt_prob_3_3.predict_proba(X_train)
y_prob_test = dt_prob_3_3.predict_proba(X_test)
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train)
prec_train = precision_score(y_train, y_pred_train, average=None)
rec_train = recall_score(y_train, y_pred_train, average=None)
# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print(f"Precision: {list(map('{:.4f}'.format,prec_train))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_train))}")
print("")
# calculate the accuracy, precision, and recall scores for the test set
acc_test = accuracy_score(y_test, y_pred_test)
prec_test = precision_score(y_test, y_pred_test, average=None)
rec_test = recall_score(y_test, y_pred_test, average=None)
# print the scores for the test set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print(f"Precision: {list(map('{:.4f}'.format,prec_test))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_test))}")
-- train set -- Accuracy : 1.0000 Precision: ['1.0000', '1.0000', '1.0000', '1.0000'] Recall: ['1.0000', '1.0000', '1.0000', '1.0000'] -- test set -- Accuracy : 0.3333 Precision: ['0.0000', '0.0000', '0.1818', '0.6250'] Recall: ['0.0000', '0.0000', '0.3333', '0.3846']
In [ ]:
# confusion matrix for training set
conf_matrix = confusion_matrix(y_pred_train, y_train)
print(conf_matrix)
# Generate confusion matrix for test set
conf_matrix = confusion_matrix(y_pred_test, y_test)
print(conf_matrix)
[[ 4 0 0 0] [ 0 3 0 0] [ 0 0 24 0] [ 0 0 0 17]] [[0 0 1 0] [0 0 0 1] [1 1 2 7] [0 0 3 5]]
In [ ]:
#Binarize by one-hot encoding to be able to generate ROC curve
label_binarizer = LabelBinarizer().fit(y_train)
y_onehot_test = label_binarizer.transform(y_test)
y_onehot_test.shape # (n_samples, n_classes)
Out[ ]:
(21, 4)
In [ ]:
n_samples, n_features = X.shape
n_classes = len(np.unique(y))
In [ ]:
n_names = ['1', '2', '3', '4']
In [ ]:
# store the fpr, tpr, and roc_auc for all averaging strategies
fpr, tpr, roc_auc = dict(), dict(), dict()
# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_onehot_test.ravel(), y_prob_test.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
print(f"Micro-averaged One-vs-Rest ROC AUC score:\n{roc_auc['micro']:.2f}")
Micro-averaged One-vs-Rest ROC AUC score: 0.56
In [ ]:
for i in range(n_classes):
fpr[i], tpr[i], _ = roc_curve(y_onehot_test[:, i], y_prob_test[:, i])
roc_auc[i] = auc(fpr[i], tpr[i])
fpr_grid = np.linspace(0.0, 1.0, 1000)
# Interpolate all ROC curves at these points
mean_tpr = np.zeros_like(fpr_grid)
for i in range(n_classes):
mean_tpr += np.interp(fpr_grid, fpr[i], tpr[i]) # linear interpolation
# Average it and compute AUC
mean_tpr /= n_classes
fpr["macro"] = fpr_grid
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
print(f"Macro-averaged One-vs-Rest ROC AUC score:\n{roc_auc['macro']:.2f}")
Macro-averaged One-vs-Rest ROC AUC score: 0.46
In [ ]:
fig, ax = plt.subplots(figsize=(6, 6))
plt.plot(
fpr["micro"],
tpr["micro"],
label=f"micro-average ROC curve (AUC = {roc_auc['micro']:.2f})",
color="deeppink",
linestyle=":",
linewidth=4,
)
plt.plot(
fpr["macro"],
tpr["macro"],
label=f"macro-average ROC curve (AUC = {roc_auc['macro']:.2f})",
color="navy",
linestyle=":",
linewidth=4,
)
colors = cycle(["aqua", "darkorange", "cornflowerblue", "lightgreen"])
for class_id, color in zip(range(n_classes), colors):
RocCurveDisplay.from_predictions(
y_onehot_test[:, class_id],
y_prob_test[:, class_id],
name=f"ROC curve for {n_names[class_id]}",
color=color,
ax=ax,
#plot_chance_level=(class_id == 2),
)
plt.axis("square")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Extension of Receiver Operating Characteristic\nto One-vs-Rest multiclass")
plt.legend()
plt.show()
In [ ]:
#calculate feature importance
tree_imp = dt_prob_3_3.feature_importances_
#create a data frame with feature names
# creating a list of column names
column_values = ['importance']
# creating the dataframe
df_tree = pd.DataFrame(data = tree_imp,
columns = column_values)
df_tree['feature']=X.columns
#sort data so features with largest importance values are at the top
df_tree2 = df_tree.sort_values(by=['importance'], ascending=False)
print(df_tree2)
#Create variable importance plot
plt.figure(figsize=(5, 10))
plt.title('Variable Importance')
plt.xlabel('Importance')
plt.ylabel('Feature')
sns.barplot(data=df_tree2,
y=df_tree2['feature'],
x=df_tree2['importance'], color="lightblue")
plt.show()
importance feature 26 0.254417 number_of_reviews 27 0.122001 instant_bookable 25 0.090523 availability_365 5 0.086512 host_listings_count 13 0.067893 beds 0 0.063043 host_since 11 0.062132 bathrooms 7 0.061669 host_identity_verified 20 0.050919 extra_people 2 0.050368 host_response_time 15 0.045262 price 10 0.045262 accommodates 21 0.000000 minimum_nights 23 0.000000 availability_60 22 0.000000 availability_30 18 0.000000 cleaning_fee 24 0.000000 availability_90 19 0.000000 guests_included 14 0.000000 bed_type 17 0.000000 security_deposit 16 0.000000 weekly_price 1 0.000000 host_location 12 0.000000 bedrooms 9 0.000000 room_type 8 0.000000 property_type 6 0.000000 host_has_profile_pic 4 0.000000 host_is_superhost 3 0.000000 host_response_rate 28 0.000000 cancellation_policy
Create And Assess Logistic Regression Models¶
Full Logistic¶
In [ ]:
# define the multinomial logistic regression model
logistic_model_prob_3 = LogisticRegression(multi_class='multinomial', solver='lbfgs')
# fit the model on the training data
logistic_model_prob_3.fit(X_train, y_train)
/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
Out[ ]:
LogisticRegression(multi_class='multinomial')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(multi_class='multinomial')
Create The LASSO and Ridge Regression Models¶
In [ ]:
# Create an Instance of Logistic Regression for LASSO Selection using c = 0.1 and c = 0.01
lr_l1_1_prob_3 = LogisticRegression(multi_class='multinomial', solver='saga', penalty='l1', C=0.1)
lr_l1_01_prob_3 = LogisticRegression(multi_class='multinomial', solver='saga', penalty='l1', C=0.01)
# fit the models to the training data
lr_l1_1_prob_3.fit(X_train, y_train)
lr_l1_01_prob_3.fit(X_train, y_train)
# Create an Instance of Logistic Regression for LASSO Selection using c = 1 and c = 0.7
lr_l1_10_prob_3 = LogisticRegression(multi_class='multinomial', solver='saga', penalty='l1', C=1)
lr_l1_7_prob_3 = LogisticRegression(multi_class='multinomial', solver='saga', penalty='l1', C=0.7)
# fit the models to the training data
lr_l1_10_prob_3.fit(X_train, y_train)
lr_l1_7_prob_3.fit(X_train, y_train)
# Create an Instance of Logistic Regression for Ridge Regression (L2 regularization)
lr_l2_prob_3 = LogisticRegression(multi_class='multinomial', solver='lbfgs', penalty='l2')
# fit the models to the training data
lr_l2_prob_3.fit(X_train, y_train)
/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
Out[ ]:
LogisticRegression(multi_class='multinomial')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(multi_class='multinomial')
Analyze The Importance Of Different Categories In The Models¶
In [ ]:
# function for model coefficients
def rpt_model_variables(model):
# Get the intercept term
intercept = model.intercept_
# Access the coefficients (weights) of the model, i rounded them
coefficients = np.round(model.coef_[0],decimals=4)
# Create DataFrames for intercept and coefficients
#df_intercept = pd.DataFrame({'feature': ['Intercept'], 'coefficient': [intercept[0]]})
df_coefficients = pd.DataFrame({'feature': X.columns, 'coefficient': coefficients})
df_coefficients['abs_coefficient'] = df_coefficients['coefficient'].abs()
df_coefficients.sort_values(by='abs_coefficient', ascending=False, inplace=True)
# if you want to add intercept to table
#df_model = pd.concat([df_intercept, df_coefficients]).reset_index(drop=True)
# Print the DataFrame
print(df_coefficients)
return df_coefficients
#Evalute the model coefficients for the models
print("Full Logistic Regression Model")
df_coefficients_full = rpt_model_variables(logistic_model_prob_3)
print("Lasso C=0.1")
df_coefficients1 = rpt_model_variables(lr_l1_1_prob_3)
print("")
print("Lasso C=0.01")
df_coefficients01 = rpt_model_variables(lr_l1_01_prob_3)
print("")
print("Lasso C=1")
df_coefficients10 = rpt_model_variables(lr_l1_10_prob_3)
print("")
print("Lasso C=0.7")
df_coefficients7 = rpt_model_variables(lr_l1_7_prob_3)
print("")
print("Ridge Regression")
df_coefficients2 = rpt_model_variables(lr_l2_prob_3)
Full Logistic Regression Model
feature coefficient abs_coefficient
17 security_deposit -0.0642 0.0642
5 host_listings_count 0.0251 0.0251
25 availability_365 -0.0205 0.0205
3 host_response_rate 0.0128 0.0128
23 availability_60 0.0113 0.0113
15 price -0.0108 0.0108
26 number_of_reviews -0.0093 0.0093
20 extra_people 0.0088 0.0088
24 availability_90 0.0087 0.0087
8 property_type -0.0061 0.0061
0 host_since 0.0026 0.0026
10 accommodates 0.0014 0.0014
19 guests_included 0.0013 0.0013
27 instant_bookable 0.0012 0.0012
16 weekly_price 0.0011 0.0011
2 host_response_time 0.0010 0.0010
12 bedrooms 0.0007 0.0007
9 room_type 0.0007 0.0007
13 beds -0.0006 0.0006
14 bed_type 0.0005 0.0005
18 cleaning_fee -0.0004 0.0004
21 minimum_nights 0.0004 0.0004
28 cancellation_policy 0.0004 0.0004
11 bathrooms 0.0003 0.0003
7 host_identity_verified 0.0003 0.0003
22 availability_30 -0.0002 0.0002
4 host_is_superhost -0.0002 0.0002
1 host_location 0.0000 0.0000
6 host_has_profile_pic 0.0000 0.0000
Lasso C=0.1
feature coefficient abs_coefficient
17 security_deposit -0.0016 0.0016
5 host_listings_count 0.0011 0.0011
0 host_since -0.0005 0.0005
15 price -0.0002 0.0002
16 weekly_price 0.0002 0.0002
25 availability_365 -0.0001 0.0001
18 cleaning_fee 0.0001 0.0001
27 instant_bookable 0.0000 0.0000
26 number_of_reviews -0.0000 0.0000
24 availability_90 0.0000 0.0000
23 availability_60 0.0000 0.0000
22 availability_30 0.0000 0.0000
21 minimum_nights 0.0000 0.0000
20 extra_people 0.0000 0.0000
19 guests_included 0.0000 0.0000
14 bed_type 0.0000 0.0000
1 host_location 0.0000 0.0000
13 beds 0.0000 0.0000
12 bedrooms 0.0000 0.0000
11 bathrooms 0.0000 0.0000
10 accommodates 0.0000 0.0000
9 room_type 0.0000 0.0000
8 property_type -0.0000 0.0000
7 host_identity_verified 0.0000 0.0000
6 host_has_profile_pic 0.0000 0.0000
4 host_is_superhost 0.0000 0.0000
3 host_response_rate 0.0000 0.0000
2 host_response_time 0.0000 0.0000
28 cancellation_policy 0.0000 0.0000
Lasso C=0.01
feature coefficient abs_coefficient
17 security_deposit -0.0013 0.0013
5 host_listings_count 0.0008 0.0008
0 host_since -0.0005 0.0005
16 weekly_price 0.0002 0.0002
15 price 0.0000 0.0000
27 instant_bookable 0.0000 0.0000
26 number_of_reviews 0.0000 0.0000
25 availability_365 0.0000 0.0000
24 availability_90 0.0000 0.0000
23 availability_60 0.0000 0.0000
22 availability_30 0.0000 0.0000
21 minimum_nights 0.0000 0.0000
20 extra_people 0.0000 0.0000
19 guests_included 0.0000 0.0000
18 cleaning_fee 0.0000 0.0000
14 bed_type 0.0000 0.0000
1 host_location 0.0000 0.0000
13 beds 0.0000 0.0000
12 bedrooms 0.0000 0.0000
11 bathrooms 0.0000 0.0000
10 accommodates 0.0000 0.0000
9 room_type 0.0000 0.0000
8 property_type 0.0000 0.0000
7 host_identity_verified 0.0000 0.0000
6 host_has_profile_pic 0.0000 0.0000
4 host_is_superhost 0.0000 0.0000
3 host_response_rate 0.0000 0.0000
2 host_response_time 0.0000 0.0000
28 cancellation_policy 0.0000 0.0000
Lasso C=1
feature coefficient abs_coefficient
17 security_deposit -0.0017 0.0017
5 host_listings_count 0.0012 0.0012
0 host_since -0.0005 0.0005
16 weekly_price 0.0002 0.0002
18 cleaning_fee 0.0002 0.0002
15 price -0.0002 0.0002
3 host_response_rate 0.0001 0.0001
26 number_of_reviews -0.0001 0.0001
25 availability_365 -0.0001 0.0001
23 availability_60 0.0001 0.0001
27 instant_bookable 0.0000 0.0000
24 availability_90 0.0000 0.0000
22 availability_30 0.0000 0.0000
21 minimum_nights 0.0000 0.0000
20 extra_people 0.0000 0.0000
19 guests_included 0.0000 0.0000
14 bed_type 0.0000 0.0000
1 host_location 0.0000 0.0000
13 beds 0.0000 0.0000
12 bedrooms 0.0000 0.0000
11 bathrooms 0.0000 0.0000
10 accommodates 0.0000 0.0000
9 room_type 0.0000 0.0000
8 property_type -0.0000 0.0000
7 host_identity_verified 0.0000 0.0000
6 host_has_profile_pic 0.0000 0.0000
4 host_is_superhost 0.0000 0.0000
2 host_response_time 0.0000 0.0000
28 cancellation_policy 0.0000 0.0000
Lasso C=0.7
feature coefficient abs_coefficient
17 security_deposit -0.0016 0.0016
5 host_listings_count 0.0012 0.0012
0 host_since -0.0005 0.0005
16 weekly_price 0.0002 0.0002
18 cleaning_fee 0.0002 0.0002
15 price -0.0002 0.0002
3 host_response_rate 0.0001 0.0001
26 number_of_reviews -0.0001 0.0001
25 availability_365 -0.0001 0.0001
23 availability_60 0.0001 0.0001
27 instant_bookable 0.0000 0.0000
24 availability_90 0.0000 0.0000
22 availability_30 0.0000 0.0000
21 minimum_nights 0.0000 0.0000
20 extra_people 0.0000 0.0000
19 guests_included 0.0000 0.0000
14 bed_type 0.0000 0.0000
1 host_location 0.0000 0.0000
13 beds 0.0000 0.0000
12 bedrooms 0.0000 0.0000
11 bathrooms 0.0000 0.0000
10 accommodates 0.0000 0.0000
9 room_type 0.0000 0.0000
8 property_type -0.0000 0.0000
7 host_identity_verified 0.0000 0.0000
6 host_has_profile_pic 0.0000 0.0000
4 host_is_superhost 0.0000 0.0000
2 host_response_time 0.0000 0.0000
28 cancellation_policy 0.0000 0.0000
Ridge Regression
feature coefficient abs_coefficient
17 security_deposit -0.0642 0.0642
5 host_listings_count 0.0251 0.0251
25 availability_365 -0.0205 0.0205
3 host_response_rate 0.0128 0.0128
23 availability_60 0.0113 0.0113
15 price -0.0108 0.0108
26 number_of_reviews -0.0093 0.0093
20 extra_people 0.0088 0.0088
24 availability_90 0.0087 0.0087
8 property_type -0.0061 0.0061
0 host_since 0.0026 0.0026
10 accommodates 0.0014 0.0014
19 guests_included 0.0013 0.0013
27 instant_bookable 0.0012 0.0012
16 weekly_price 0.0011 0.0011
2 host_response_time 0.0010 0.0010
12 bedrooms 0.0007 0.0007
9 room_type 0.0007 0.0007
13 beds -0.0006 0.0006
14 bed_type 0.0005 0.0005
18 cleaning_fee -0.0004 0.0004
21 minimum_nights 0.0004 0.0004
28 cancellation_policy 0.0004 0.0004
11 bathrooms 0.0003 0.0003
7 host_identity_verified 0.0003 0.0003
22 availability_30 -0.0002 0.0002
4 host_is_superhost -0.0002 0.0002
1 host_location 0.0000 0.0000
6 host_has_profile_pic 0.0000 0.0000
In [ ]:
# plot variable importance
# function to plot variable importance by creating a bar chart
# of absolute coefficients
def plot_variable_imp(df_coef):
# determine the variables the model is using and create df
# of their absolute coefficients
df_plt = df_coef[df_coef['abs_coefficient'] != 0]
# determine the variables the model is not using
reject_vars = df_coef[df_coef['abs_coefficient'] == 0]['feature'].tolist()
# bar graph of the absolute coefficients that the model is using
plt.figure(figsize=(5, 10))
plt.title('Variable Importance')
plt.xlabel('Coefficient')
plt.ylabel('Feature')
sns.barplot(data=df_plt,
y=df_plt['feature'],
x=df_plt['abs_coefficient'], color="lightblue")
plt.show()
# print the variables the model is not using after the bar graph
print("-- rejected --")
for i in reject_vars:
print(f" {i}")
# plot the variable importance for the models
print("Full Logistic Regression Model")
plot_variable_imp(df_coefficients_full)
print("")
print("Lasso C=0.1")
plot_variable_imp(df_coefficients1)
print("")
print("Lasso C=0.01")
plot_variable_imp(df_coefficients01)
print("")
print("Lasso C=1")
plot_variable_imp(df_coefficients10)
print("")
print("Lasso C=0.7")
plot_variable_imp(df_coefficients7)
print("")
print("Ridge Regression")
plot_variable_imp(df_coefficients2)
Full Logistic Regression Model
-- rejected -- host_location host_has_profile_pic Lasso C=0.1
-- rejected -- instant_bookable number_of_reviews availability_90 availability_60 availability_30 minimum_nights extra_people guests_included bed_type host_location beds bedrooms bathrooms accommodates room_type property_type host_identity_verified host_has_profile_pic host_is_superhost host_response_rate host_response_time cancellation_policy Lasso C=0.01
-- rejected -- price instant_bookable number_of_reviews availability_365 availability_90 availability_60 availability_30 minimum_nights extra_people guests_included cleaning_fee bed_type host_location beds bedrooms bathrooms accommodates room_type property_type host_identity_verified host_has_profile_pic host_is_superhost host_response_rate host_response_time cancellation_policy Lasso C=1
-- rejected -- instant_bookable availability_90 availability_30 minimum_nights extra_people guests_included bed_type host_location beds bedrooms bathrooms accommodates room_type property_type host_identity_verified host_has_profile_pic host_is_superhost host_response_time cancellation_policy Lasso C=0.7
-- rejected -- instant_bookable availability_90 availability_30 minimum_nights extra_people guests_included bed_type host_location beds bedrooms bathrooms accommodates room_type property_type host_identity_verified host_has_profile_pic host_is_superhost host_response_time cancellation_policy Ridge Regression
-- rejected -- host_location host_has_profile_pic
Make Predictions To Evaluate The Models¶
In [ ]:
# make predictions on the training and testing data for all of the models to
# evaluate the models
#Full Regression
y_pred_train_full = logistic_model_prob_3.predict(X_train)
y_pred_test_full = logistic_model_prob_3.predict(X_test)
y_proba_train_full = logistic_model_prob_3.predict_proba(X_train)
y_proba_test_full = logistic_model_prob_3.predict_proba(X_test)
#Lasso C=0.1
y_pred_train = lr_l1_1_prob_3.predict(X_train)
y_pred_test = lr_l1_1_prob_3.predict(X_test)
y_proba_train = lr_l1_1_prob_3.predict_proba(X_train)
y_proba_test = lr_l1_1_prob_3.predict_proba(X_test)
#Lasso C=0.01
y_pred_train1 = lr_l1_01_prob_3.predict(X_train)
y_pred_test1 = lr_l1_01_prob_3.predict(X_test)
y_proba_train1 = lr_l1_01_prob_3.predict_proba(X_train)
y_proba_test1 = lr_l1_01_prob_3.predict_proba(X_test)
#Lasso C=1
y_pred_train10 = lr_l1_10_prob_3.predict(X_train)
y_pred_test10 = lr_l1_10_prob_3.predict(X_test)
y_proba_train10 = lr_l1_10_prob_3.predict_proba(X_train)
y_proba_test10 = lr_l1_10_prob_3.predict_proba(X_test)
#Lasso C=0.7
y_pred_train7 = lr_l1_7_prob_3.predict(X_train)
y_pred_test7 = lr_l1_7_prob_3.predict(X_test)
y_proba_train7 = lr_l1_7_prob_3.predict_proba(X_train)
y_proba_test7 = lr_l1_7_prob_3.predict_proba(X_test)
#Ridge Regression
y_pred_train2 = lr_l2_prob_3.predict(X_train)
y_pred_test2 = lr_l2_prob_3.predict(X_test)
y_proba_train2 = lr_l2_prob_3.predict_proba(X_train)
y_proba_test2 = lr_l2_prob_3.predict_proba(X_test)
Evaluate The Models¶
Full Model¶
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train_full)
prec_train = precision_score(y_train, y_pred_train_full, average=None)
rec_train = recall_score(y_train, y_pred_train_full, average=None)
# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print(f"Precision: {list(map('{:.4f}'.format,prec_train))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_train))}")
print("")
# calculate the accuracy, precision, and recall scores for the test set
acc_test = accuracy_score(y_test, y_pred_test_full)
prec_test = precision_score(y_test, y_pred_test_full, average=None)
rec_test = recall_score(y_test, y_pred_test_full, average=None)
# print the scores for the test set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print(f"Precision: {list(map('{:.4f}'.format,prec_test))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_test))}")
-- train set -- Accuracy : 0.6667 Precision: ['0.3333', '0.5000', '0.6552', '0.7857'] Recall: ['0.2500', '0.3333', '0.7917', '0.6471'] -- test set -- Accuracy : 0.5238 Precision: ['0.0000', '0.0000', '0.5000', '0.8571'] Recall: ['0.0000', '0.0000', '0.8333', '0.4615']
L1 with c=0.1¶
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train)
prec_train = precision_score(y_train, y_pred_train, average=None)
rec_train = recall_score(y_train, y_pred_train, average=None)
# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print(f"Precision: {list(map('{:.4f}'.format,prec_train))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_train))}")
print("")
# calculate the accuracy, precision, and recall scores for the test set
acc_test = accuracy_score(y_test, y_pred_test)
prec_test = precision_score(y_test, y_pred_test, average=None)
rec_test = recall_score(y_test, y_pred_test, average=None)
# print the scores for the test set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print(f"Precision: {list(map('{:.4f}'.format,prec_test))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_test))}")
-- train set -- Accuracy : 0.5208 Precision: ['0.5000', '0.0000', '0.5238', '0.5000'] Recall: ['0.2500', '0.0000', '0.9167', '0.1176'] -- test set -- Accuracy : 0.3333 Precision: ['0.0000', '0.0000', '0.3333', '0.5000'] Recall: ['0.0000', '0.0000', '1.0000', '0.0769']
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
L1 with c=0.01¶
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train1)
prec_train = precision_score(y_train, y_pred_train1, average=None)
rec_train = recall_score(y_train, y_pred_train1, average=None)
# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print(f"Precision: {list(map('{:.4f}'.format,prec_train))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_train))}")
print("")
# calculate the accuracy, precision, and recall scores for the test set
acc_test = accuracy_score(y_test, y_pred_test1)
prec_test = precision_score(y_test, y_pred_test1, average=None)
rec_test = recall_score(y_test, y_pred_test1, average=None)
# print the scores for the test set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print(f"Precision: {list(map('{:.4f}'.format,prec_test))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_test))}")
-- train set -- Accuracy : 0.5208 Precision: ['0.5000', '0.0000', '0.5227', '0.5000'] Recall: ['0.2500', '0.0000', '0.9583', '0.0588'] -- test set -- Accuracy : 0.3333 Precision: ['0.0000', '0.0000', '0.3333', '0.5000'] Recall: ['0.0000', '0.0000', '1.0000', '0.0769']
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
L1 with C=1¶
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train10)
prec_train = precision_score(y_train, y_pred_train10, average=None)
rec_train = recall_score(y_train, y_pred_train10, average=None)
# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print(f"Precision: {list(map('{:.4f}'.format,prec_train))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_train))}")
print("")
# calculate the accuracy, precision, and recall scores for the test set
acc_test = accuracy_score(y_test, y_pred_test10)
prec_test = precision_score(y_test, y_pred_test10, average=None)
rec_test = recall_score(y_test, y_pred_test10, average=None)
# print the scores for the test set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print(f"Precision: {list(map('{:.4f}'.format,prec_test))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_test))}")
-- train set -- Accuracy : 0.5208 Precision: ['0.5000', '0.0000', '0.5238', '0.5000'] Recall: ['0.2500', '0.0000', '0.9167', '0.1176'] -- test set -- Accuracy : 0.3333 Precision: ['0.0000', '0.0000', '0.3333', '0.5000'] Recall: ['0.0000', '0.0000', '1.0000', '0.0769']
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
L1 with C=0.7¶
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train7)
prec_train = precision_score(y_train, y_pred_train7, average=None)
rec_train = recall_score(y_train, y_pred_train7, average=None)
# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print(f"Precision: {list(map('{:.4f}'.format,prec_train))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_train))}")
print("")
# calculate the accuracy, precision, and recall scores for the test set
acc_test = accuracy_score(y_test, y_pred_test7)
prec_test = precision_score(y_test, y_pred_test7, average=None)
rec_test = recall_score(y_test, y_pred_test7, average=None)
# print the scores for the test set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print(f"Precision: {list(map('{:.4f}'.format,prec_test))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_test))}")
-- train set -- Accuracy : 0.5208 Precision: ['0.5000', '0.0000', '0.5238', '0.5000'] Recall: ['0.2500', '0.0000', '0.9167', '0.1176'] -- test set -- Accuracy : 0.3333 Precision: ['0.0000', '0.0000', '0.3333', '0.5000'] Recall: ['0.0000', '0.0000', '1.0000', '0.0769']
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
L2 Regularization¶
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train2)
prec_train = precision_score(y_train, y_pred_train2, average=None)
rec_train = recall_score(y_train, y_pred_train2, average=None)
# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print(f"Precision: {list(map('{:.4f}'.format,prec_train))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_train))}")
print("")
# calculate the accuracy, precision, and recall scores for the test set
acc_test = accuracy_score(y_test, y_pred_test2)
prec_test = precision_score(y_test, y_pred_test2, average=None)
rec_test = recall_score(y_test, y_pred_test2, average=None)
# print the scores for the test set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print(f"Precision: {list(map('{:.4f}'.format,prec_test))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_test))}")
-- train set -- Accuracy : 0.6667 Precision: ['0.3333', '0.5000', '0.6552', '0.7857'] Recall: ['0.2500', '0.3333', '0.7917', '0.6471'] -- test set -- Accuracy : 0.5238 Precision: ['0.0000', '0.0000', '0.5000', '0.8571'] Recall: ['0.0000', '0.0000', '0.8333', '0.4615']
Classifiers ("10" Catgories)¶
In [ ]:
def categorize_review_scores(score):
if score < 10:
return 0
elif score >= 10 and score < 20:
return 1
elif score >= 20 and score < 30:
return 2
elif score >= 30 and score < 40:
return 3
elif score >= 40 and score < 50:
return 4
elif score >= 50 and score < 60:
return 5
elif score >= 60 and score < 70:
return 6
elif score >= 70 and score < 80:
return 7
elif score >= 80 and score < 90:
return 8
else:
return 9
In [ ]:
df_prob_3_2 = df.drop(['review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value'], axis=1)
df_prob_3_2['review_scores_rating'] = df_prob_3_2['review_scores_rating'].apply(lambda x: categorize_review_scores(x)).astype('category')
df_prob_3_2['review_scores_rating'].head(60)
Out[ ]:
7 9 9 9 11 9 12 9 13 9 14 9 16 8 19 9 23 9 26 9 27 8 28 9 29 6 34 9 35 9 37 8 39 8 40 8 44 9 46 8 50 9 51 8 53 9 56 9 57 9 58 9 60 9 62 9 63 9 65 9 66 9 67 9 68 9 71 9 72 9 75 9 76 9 77 9 83 8 85 9 86 9 87 9 88 9 89 9 93 9 94 9 96 9 100 9 101 9 102 9 103 9 104 8 107 9 108 9 110 9 111 9 113 9 114 9 115 9 116 9 Name: review_scores_rating, dtype: category Categories (7, int64): [2, 4, 5, 6, 7, 8, 9]
In [ ]:
#No reviews are below 20
print(df_prob_3_2['review_scores_rating'] [df_prob_3_2['review_scores_rating'] == 0])
print(df_prob_3_2['review_scores_rating'][df_prob_3_2['review_scores_rating'] == 1])
Series([], Name: review_scores_rating, dtype: category Categories (7, int64): [2, 4, 5, 6, 7, 8, 9]) Series([], Name: review_scores_rating, dtype: category Categories (7, int64): [2, 4, 5, 6, 7, 8, 9])
In [ ]:
#Some review scores between 20 and 30
print(df_prob_3_2['review_scores_rating'][df_prob_3_2['review_scores_rating'] == 2])
955 2 4856 2 4881 2 5151 2 5635 2 Name: review_scores_rating, dtype: category Categories (7, int64): [2, 4, 5, 6, 7, 8, 9]
In [ ]:
#No review scores between 30 and 40
print(df_prob_3_2['review_scores_rating'][df_prob_3_2['review_scores_rating'] == 3])
Series([], Name: review_scores_rating, dtype: category Categories (7, int64): [2, 4, 5, 6, 7, 8, 9])
In [ ]:
df_prob_3_2.groupby('review_scores_rating').count()
Out[ ]:
| host_since | host_location | host_response_time | host_response_rate | host_is_superhost | host_listings_count | host_has_profile_pic | host_identity_verified | property_type | room_type | ... | guests_included | extra_people | minimum_nights | availability_30 | availability_60 | availability_90 | availability_365 | number_of_reviews | instant_bookable | cancellation_policy | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| review_scores_rating | |||||||||||||||||||||
| 2 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | ... | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 |
| 4 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | ... | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
| 5 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | ... | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 |
| 6 | 25 | 25 | 25 | 25 | 25 | 25 | 25 | 25 | 25 | 25 | ... | 25 | 25 | 25 | 25 | 25 | 25 | 25 | 25 | 25 | 25 |
| 7 | 18 | 18 | 18 | 18 | 18 | 18 | 18 | 18 | 18 | 18 | ... | 18 | 18 | 18 | 18 | 18 | 18 | 18 | 18 | 18 | 18 |
| 8 | 318 | 318 | 318 | 318 | 318 | 318 | 318 | 318 | 318 | 318 | ... | 318 | 318 | 318 | 318 | 318 | 318 | 318 | 318 | 318 | 318 |
| 9 | 3002 | 3002 | 3002 | 3002 | 3002 | 3002 | 3002 | 3002 | 3002 | 3002 | ... | 3002 | 3002 | 3002 | 3002 | 3002 | 3002 | 3002 | 3002 | 3002 | 3002 |
7 rows × 29 columns
In [ ]:
df_prob_3_sampled = df_prob_3_2.groupby('review_scores_rating').apply(lambda s: s.sample(min(len(s), 200)))
In [ ]:
#Split the data on what the model is learning to predict, whether an AirBnB will be booked
X = df_prob_3_sampled.drop('review_scores_rating', axis=1)
y = df_prob_3_sampled['review_scores_rating']
#Split the data into training and test sets to be able to train and compare models
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=77)
X_train.info()
y_train.info()
<class 'pandas.core.frame.DataFrame'> MultiIndex: 316 entries, (9, 4410) to (8, 2404) Data columns (total 29 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 host_since 316 non-null int64 1 host_location 316 non-null category 2 host_response_time 316 non-null int64 3 host_response_rate 316 non-null float64 4 host_is_superhost 316 non-null category 5 host_listings_count 316 non-null float64 6 host_has_profile_pic 316 non-null category 7 host_identity_verified 316 non-null category 8 property_type 316 non-null int64 9 room_type 316 non-null int64 10 accommodates 316 non-null int64 11 bathrooms 316 non-null float64 12 bedrooms 316 non-null float64 13 beds 316 non-null float64 14 bed_type 316 non-null int64 15 price 316 non-null float64 16 weekly_price 316 non-null float64 17 security_deposit 316 non-null float64 18 cleaning_fee 316 non-null float64 19 guests_included 316 non-null int64 20 extra_people 316 non-null float64 21 minimum_nights 316 non-null int64 22 availability_30 316 non-null int64 23 availability_60 316 non-null int64 24 availability_90 316 non-null int64 25 availability_365 316 non-null int64 26 number_of_reviews 316 non-null int64 27 instant_bookable 316 non-null category 28 cancellation_policy 316 non-null int64 dtypes: category(5), float64(10), int64(14) memory usage: 82.6 KB <class 'pandas.core.series.Series'> MultiIndex: 316 entries, (9, 4410) to (8, 2404) Series name: review_scores_rating Non-Null Count Dtype -------------- ----- 316 non-null category dtypes: category(1) memory usage: 21.9 KB
Create And Assess Decision Tree Classifiers¶
Default Tree¶
In [ ]:
# create an instance of a decision tree classifier using default values
dt_prob_3 = DecisionTreeClassifier(max_depth = 25, min_samples_leaf=10, ccp_alpha = 0.001)
# fit the model to the training data
dt_prob_3.fit(X_train, y_train)
Out[ ]:
DecisionTreeClassifier(ccp_alpha=0.001, max_depth=25, min_samples_leaf=10)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier(ccp_alpha=0.001, max_depth=25, min_samples_leaf=10)
In [ ]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
# visualize the decision tree
dot_data = export_graphviz(dt_prob_3, filled=True, rounded=True, feature_names=X.columns, class_names=['2','4','5','6','7','8','9'])
graph = graphviz.Source(dot_data)
graph.render("decision_tree_default_prob_3")
Out[ ]:
'decision_tree_default_prob_3.pdf'
In [ ]:
#Show the visualization of the decision tree in this notebook
export_graphviz(dt_prob_3,
'tree.dot',
class_names=['2','4','5','6','7','8','9'],
feature_names = X_train.columns)
! dot -Tpng tree.dot -o tree.png
import matplotlib.pyplot as plt
import cv2
%matplotlib inline
img = cv2.imread('tree.png')
plt.figure(figsize = (20, 40))
plt.imshow(img)
Out[ ]:
<matplotlib.image.AxesImage at 0x7c57c250dde0>
In [ ]:
# make predictions on the training and test data
y_pred_train = dt_prob_3.predict(X_train)
y_pred_test = dt_prob_3.predict(X_test)
y_prob_train = dt_prob_3.predict_proba(X_train)
y_prob_test = dt_prob_3.predict_proba(X_test)
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train)
prec_train = precision_score(y_train, y_pred_train, average=None)
rec_train = recall_score(y_train, y_pred_train, average=None)
# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print(f"Precision: {list(map('{:.4f}'.format,prec_train))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_train))}")
print("")
# calculate the accuracy, precision, and recall scores for the test set
acc_test = accuracy_score(y_test, y_pred_test)
prec_test = precision_score(y_test, y_pred_test, average=None)
rec_test = recall_score(y_test, y_pred_test, average=None)
# print the scores for the test set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print(f"Precision: {list(map('{:.4f}'.format,prec_test))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_test))}")
-- train set -- Accuracy : 0.7152 Precision: ['0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.7143', '0.7160'] Recall: ['0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.7971', '0.8056'] -- test set -- Accuracy : 0.4926 Precision: ['0.0000', '0.0000', '0.0000', '0.5000', '0.4861'] Recall: ['0.0000', '0.0000', '0.0000', '0.5161', '0.6250']
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
In [ ]:
# confusion matrix for training set
conf_matrix = confusion_matrix(y_pred_train, y_train)
print(conf_matrix)
# Generate confusion matrix for test set
conf_matrix = confusion_matrix(y_pred_test, y_test)
print(conf_matrix)
[[ 0 0 0 0 0 0 0] [ 0 0 0 0 0 0 0] [ 0 0 0 0 0 0 0] [ 0 0 0 0 0 0 0] [ 0 0 0 0 0 0 0] [ 4 0 0 5 7 110 28] [ 1 1 2 8 6 28 116]] [[ 0 0 0 0 0] [ 0 0 0 0 0] [ 0 0 0 0 0] [ 1 7 3 32 21] [ 0 5 2 30 35]]
In [ ]:
#Binarize by one-hot encoding to be able to generate ROC curve
label_binarizer = LabelBinarizer().fit(y_train)
y_onehot_test = label_binarizer.transform(y_test)
y_onehot_test.shape # (n_samples, n_classes)
Out[ ]:
(136, 7)
In [ ]:
n_samples, n_features = X.shape
n_classes = len(np.unique(y))
In [ ]:
n_names = ['2','4','5','6','7','8','9']
In [ ]:
# store the fpr, tpr, and roc_auc for all averaging strategies
fpr, tpr, roc_auc = dict(), dict(), dict()
# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_onehot_test.ravel(), y_prob_test.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
print(f"Micro-averaged One-vs-Rest ROC AUC score:\n{roc_auc['micro']:.2f}")
Micro-averaged One-vs-Rest ROC AUC score: 0.84
In [ ]:
for i in range(n_classes):
fpr[i], tpr[i], _ = roc_curve(y_onehot_test[:, i], y_prob_test[:, i])
roc_auc[i] = auc(fpr[i], tpr[i])
fpr_grid = np.linspace(0.0, 1.0, 1000)
# Interpolate all ROC curves at these points
mean_tpr = np.zeros_like(fpr_grid)
for i in range(n_classes):
mean_tpr += np.interp(fpr_grid, fpr[i], tpr[i]) # linear interpolation
# Average it and compute AUC
mean_tpr /= n_classes
fpr["macro"] = fpr_grid
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
print(f"Macro-averaged One-vs-Rest ROC AUC score:\n{roc_auc['macro']:.2f}")
Macro-averaged One-vs-Rest ROC AUC score: nan
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_ranking.py:1029: UndefinedMetricWarning: No positive samples in y_true, true positive value should be meaningless warnings.warn( /usr/local/lib/python3.10/dist-packages/sklearn/metrics/_ranking.py:1029: UndefinedMetricWarning: No positive samples in y_true, true positive value should be meaningless warnings.warn(
In [ ]:
fig, ax = plt.subplots(figsize=(6, 6))
plt.plot(
fpr["micro"],
tpr["micro"],
label=f"micro-average ROC curve (AUC = {roc_auc['micro']:.2f})",
color="deeppink",
linestyle=":",
linewidth=4,
)
plt.plot(
fpr["macro"],
tpr["macro"],
label=f"macro-average ROC curve (AUC = {roc_auc['macro']:.2f})",
color="navy",
linestyle=":",
linewidth=4,
)
colors = cycle(["aqua", "darkorange", "cornflowerblue", "lightgreen", "pink", "purple"])
for class_id, color in zip(range(n_classes-1), colors):
RocCurveDisplay.from_predictions(
y_onehot_test[:, class_id],
y_prob_test[:, class_id],
name=f"ROC curve for {n_names[class_id]}",
color=color,
ax=ax,
#plot_chance_level=(class_id == 2),
)
plt.axis("square")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Extension of Receiver Operating Characteristic\nto One-vs-Rest multiclass")
plt.legend()
plt.show()
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_ranking.py:1029: UndefinedMetricWarning: No positive samples in y_true, true positive value should be meaningless warnings.warn( /usr/local/lib/python3.10/dist-packages/sklearn/metrics/_ranking.py:1029: UndefinedMetricWarning: No positive samples in y_true, true positive value should be meaningless warnings.warn(
In [ ]:
#calculate feature importance
tree_imp = dt_prob_3.feature_importances_
#create a data frame with feature names
# creating a list of column names
column_values = ['importance']
# creating the dataframe
df_tree = pd.DataFrame(data = tree_imp,
columns = column_values)
df_tree['feature']=X.columns
#sort data so features with largest importance values are at the top
df_tree2 = df_tree.sort_values(by=['importance'], ascending=False)
print(df_tree2)
#Create variable importance plot
plt.figure(figsize=(5, 10))
plt.title('Variable Importance')
plt.xlabel('Importance')
plt.ylabel('Feature')
sns.barplot(data=df_tree2,
y=df_tree2['feature'],
x=df_tree2['importance'], color="lightblue")
plt.show()
importance feature 16 0.233062 weekly_price 4 0.200269 host_is_superhost 17 0.092846 security_deposit 26 0.074683 number_of_reviews 18 0.058502 cleaning_fee 8 0.056258 property_type 24 0.055359 availability_90 23 0.053939 availability_60 0 0.046475 host_since 20 0.045182 extra_people 19 0.037713 guests_included 28 0.030051 cancellation_policy 10 0.015662 accommodates 12 0.000000 bedrooms 21 0.000000 minimum_nights 27 0.000000 instant_bookable 2 0.000000 host_response_time 25 0.000000 availability_365 3 0.000000 host_response_rate 5 0.000000 host_listings_count 22 0.000000 availability_30 6 0.000000 host_has_profile_pic 13 0.000000 beds 7 0.000000 host_identity_verified 9 0.000000 room_type 11 0.000000 bathrooms 15 0.000000 price 1 0.000000 host_location 14 0.000000 bed_type
Tuned Tree¶
In [ ]:
#Use a grid search with a decision tree to determine which parameters obatin the
#best scores on the training set so we have "tuned" parameters or values
dt_tune_prob_3 = DecisionTreeClassifier()
param_grid = {
'max_depth': [None, 5, 10, 15, 20, 25],
'min_samples_leaf': [1, 10, 20, 50, 100],
'ccp_alpha': [0, 0.001, 0.01, 0.1, 0.5, 1]
}
grid_search = GridSearchCV(dt_tune_prob_3, param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_
print(best_params)
print(best_estimator)
/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_split.py:700: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=5. warnings.warn(
{'ccp_alpha': 0, 'max_depth': 5, 'min_samples_leaf': 20}
DecisionTreeClassifier(ccp_alpha=0, max_depth=5, min_samples_leaf=20)
In [ ]:
# create an instance of a decision tree classifier using "tuned" values
dt_tuned_prob_3 = DecisionTreeClassifier(max_depth = 5, min_samples_leaf=20, ccp_alpha = 0)
# fit the model to the training data
dt_tuned_prob_3.fit(X_train, y_train)
Out[ ]:
DecisionTreeClassifier(ccp_alpha=0, max_depth=5, min_samples_leaf=20)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier(ccp_alpha=0, max_depth=5, min_samples_leaf=20)
In [ ]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
# visualize the decision tree
dot_data = export_graphviz(dt_tuned_prob_3, filled=True, rounded=True, feature_names=X.columns, class_names=['2','4','5','6','7','8','9'])
graph = graphviz.Source(dot_data)
graph.render("decision_tree_tuned_prob_3")
Out[ ]:
'decision_tree_tuned_prob_3.pdf'
In [ ]:
#Show the visualization of the decision tree in this notebook
export_graphviz(dt_tuned_prob_3,
'tree.dot',
class_names=['2','4','5','6','7','8','9'],
feature_names = X_train.columns)
! dot -Tpng tree.dot -o tree.png
import matplotlib.pyplot as plt
import cv2
%matplotlib inline
img = cv2.imread('tree.png')
plt.figure(figsize = (20, 40))
plt.imshow(img)
Out[ ]:
<matplotlib.image.AxesImage at 0x7c57c4b359c0>
In [ ]:
# make predictions on the training and test data
y_pred_train = dt_tuned_prob_3.predict(X_train)
y_pred_test = dt_tuned_prob_3.predict(X_test)
y_prob_train = dt_tuned_prob_3.predict_proba(X_train)
y_prob_test = dt_tuned_prob_3.predict_proba(X_test)
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train)
prec_train = precision_score(y_train, y_pred_train, average=None)
rec_train = recall_score(y_train, y_pred_train, average=None)
# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print(f"Precision: {list(map('{:.4f}'.format,prec_train))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_train))}")
print("")
# calculate the accuracy, precision, and recall scores for the test set
acc_test = accuracy_score(y_test, y_pred_test)
prec_test = precision_score(y_test, y_pred_test, average=None)
rec_test = recall_score(y_test, y_pred_test, average=None)
# print the scores for the test set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print(f"Precision: {list(map('{:.4f}'.format,prec_test))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_test))}")
-- train set -- Accuracy : 0.6361 Precision: ['0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.5756', '0.7477'] Recall: ['0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.8551', '0.5764'] -- test set -- Accuracy : 0.5441 Precision: ['0.0000', '0.0000', '0.0000', '0.5000', '0.6364'] Recall: ['0.0000', '0.0000', '0.0000', '0.7419', '0.5000']
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
In [ ]:
# confusion matrix for training set
conf_matrix = confusion_matrix(y_pred_train, y_train)
print(conf_matrix)
# Generate confusion matrix for test set
conf_matrix = confusion_matrix(y_pred_test, y_test)
print(conf_matrix)
[[ 0 0 0 0 0 0 0] [ 0 0 0 0 0 0 0] [ 0 0 0 0 0 0 0] [ 0 0 0 0 0 0 0] [ 0 0 0 0 0 0 0] [ 5 0 1 10 10 118 61] [ 0 1 1 3 3 20 83]] [[ 0 0 0 0 0] [ 0 0 0 0 0] [ 0 0 0 0 0] [ 1 12 5 46 28] [ 0 0 0 16 28]]
In [ ]:
#Binarize by one-hot encoding to be able to generate ROC curve
label_binarizer = LabelBinarizer().fit(y_train)
y_onehot_test = label_binarizer.transform(y_test)
y_onehot_test.shape # (n_samples, n_classes)
Out[ ]:
(136, 7)
In [ ]:
n_samples, n_features = X.shape
n_classes = len(np.unique(y))
In [ ]:
n_names = ['2','4','5','6','7','8','9']
In [ ]:
# store the fpr, tpr, and roc_auc for all averaging strategies
fpr, tpr, roc_auc = dict(), dict(), dict()
# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_onehot_test.ravel(), y_prob_test.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
print(f"Micro-averaged One-vs-Rest ROC AUC score:\n{roc_auc['micro']:.2f}")
Micro-averaged One-vs-Rest ROC AUC score: 0.88
In [ ]:
for i in range(n_classes):
fpr[i], tpr[i], _ = roc_curve(y_onehot_test[:, i], y_prob_test[:, i])
roc_auc[i] = auc(fpr[i], tpr[i])
fpr_grid = np.linspace(0.0, 1.0, 1000)
# Interpolate all ROC curves at these points
mean_tpr = np.zeros_like(fpr_grid)
for i in range(n_classes):
mean_tpr += np.interp(fpr_grid, fpr[i], tpr[i]) # linear interpolation
# Average it and compute AUC
mean_tpr /= n_classes
fpr["macro"] = fpr_grid
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
print(f"Macro-averaged One-vs-Rest ROC AUC score:\n{roc_auc['macro']:.2f}")
Macro-averaged One-vs-Rest ROC AUC score: nan
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_ranking.py:1029: UndefinedMetricWarning: No positive samples in y_true, true positive value should be meaningless warnings.warn( /usr/local/lib/python3.10/dist-packages/sklearn/metrics/_ranking.py:1029: UndefinedMetricWarning: No positive samples in y_true, true positive value should be meaningless warnings.warn(
In [ ]:
fig, ax = plt.subplots(figsize=(6, 6))
plt.plot(
fpr["micro"],
tpr["micro"],
label=f"micro-average ROC curve (AUC = {roc_auc['micro']:.2f})",
color="deeppink",
linestyle=":",
linewidth=4,
)
plt.plot(
fpr["macro"],
tpr["macro"],
label=f"macro-average ROC curve (AUC = {roc_auc['macro']:.2f})",
color="navy",
linestyle=":",
linewidth=4,
)
colors = cycle(["aqua", "darkorange", "cornflowerblue", "lightgreen", "pink", "purple"])
for class_id, color in zip(range(n_classes-1), colors):
RocCurveDisplay.from_predictions(
y_onehot_test[:, class_id],
y_prob_test[:, class_id],
name=f"ROC curve for {n_names[class_id]}",
color=color,
ax=ax,
#plot_chance_level=(class_id == 2),
)
plt.axis("square")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Extension of Receiver Operating Characteristic\nto One-vs-Rest multiclass")
plt.legend()
plt.show()
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_ranking.py:1029: UndefinedMetricWarning: No positive samples in y_true, true positive value should be meaningless warnings.warn( /usr/local/lib/python3.10/dist-packages/sklearn/metrics/_ranking.py:1029: UndefinedMetricWarning: No positive samples in y_true, true positive value should be meaningless warnings.warn(
In [ ]:
#calculate feature importance
tree_imp = dt_tuned_prob_3.feature_importances_
#create a data frame with feature names
# creating a list of column names
column_values = ['importance']
# creating the dataframe
df_tree = pd.DataFrame(data = tree_imp,
columns = column_values)
df_tree['feature']=X.columns
#sort data so features with largest importance values are at the top
df_tree2 = df_tree.sort_values(by=['importance'], ascending=False)
print(df_tree2)
#Create variable importance plot
plt.figure(figsize=(5, 10))
plt.title('Variable Importance')
plt.xlabel('Importance')
plt.ylabel('Feature')
sns.barplot(data=df_tree2,
y=df_tree2['feature'],
x=df_tree2['importance'], color="lightblue")
plt.show()
importance feature 4 0.334405 host_is_superhost 16 0.288412 weekly_price 17 0.155032 security_deposit 26 0.071443 number_of_reviews 19 0.062972 guests_included 22 0.059257 availability_30 21 0.028479 minimum_nights 0 0.000000 host_since 15 0.000000 price 27 0.000000 instant_bookable 25 0.000000 availability_365 24 0.000000 availability_90 23 0.000000 availability_60 20 0.000000 extra_people 18 0.000000 cleaning_fee 14 0.000000 bed_type 1 0.000000 host_location 13 0.000000 beds 12 0.000000 bedrooms 11 0.000000 bathrooms 10 0.000000 accommodates 9 0.000000 room_type 8 0.000000 property_type 7 0.000000 host_identity_verified 6 0.000000 host_has_profile_pic 5 0.000000 host_listings_count 3 0.000000 host_response_rate 2 0.000000 host_response_time 28 0.000000 cancellation_policy
Less Complex Tree¶
In [ ]:
# create an instance of a decision tree classifier using default values
dt_prob_3_2 = DecisionTreeClassifier(max_depth = 5, min_samples_leaf=20, ccp_alpha = 0.01)
# fit the model to the training data
dt_prob_3_2.fit(X_train, y_train)
Out[ ]:
DecisionTreeClassifier(ccp_alpha=0.01, max_depth=5, min_samples_leaf=20)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier(ccp_alpha=0.01, max_depth=5, min_samples_leaf=20)
In [ ]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
# visualize the decision tree
dot_data = export_graphviz(dt_prob_3_2, filled=True, rounded=True, feature_names=X.columns, class_names=['2','4','5','6','7','8','9'])
graph = graphviz.Source(dot_data)
graph.render("decision_tree_less_complex_prob_3")
Out[ ]:
'decision_tree_less_complex_prob_3.pdf'
In [ ]:
#Show the visualization of the decision tree in this notebook
export_graphviz(dt_prob_3_2,
'tree.dot',
class_names=['2','4','5','6','7','8','9'],
feature_names = X_train.columns)
! dot -Tpng tree.dot -o tree.png
import matplotlib.pyplot as plt
import cv2
%matplotlib inline
img = cv2.imread('tree.png')
plt.figure(figsize = (20, 40))
plt.imshow(img)
Out[ ]:
<matplotlib.image.AxesImage at 0x7c57ddd4f0d0>
In [ ]:
# make predictions on the training and test data
y_pred_train = dt_prob_3_2.predict(X_train)
y_pred_test = dt_prob_3_2.predict(X_test)
y_prob_train = dt_prob_3_2.predict_proba(X_train)
y_prob_test = dt_prob_3_2.predict_proba(X_test)
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train)
prec_train = precision_score(y_train, y_pred_train, average=None)
rec_train = recall_score(y_train, y_pred_train, average=None)
# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print(f"Precision: {list(map('{:.4f}'.format,prec_train))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_train))}")
print("")
# calculate the accuracy, precision, and recall scores for the test set
acc_test = accuracy_score(y_test, y_pred_test)
prec_test = precision_score(y_test, y_pred_test, average=None)
rec_test = recall_score(y_test, y_pred_test, average=None)
# print the scores for the test set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print(f"Precision: {list(map('{:.4f}'.format,prec_test))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_test))}")
-- train set -- Accuracy : 0.6139 Precision: ['0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.5507', '0.7753'] Recall: ['0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.9058', '0.4792'] -- test set -- Accuracy : 0.5882 Precision: ['0.0000', '0.0000', '0.0000', '0.5300', '0.7500'] Recall: ['0.0000', '0.0000', '0.0000', '0.8548', '0.4821']
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
In [ ]:
# confusion matrix for training set
conf_matrix = confusion_matrix(y_pred_train, y_train)
print(conf_matrix)
# Generate confusion matrix for test set
conf_matrix = confusion_matrix(y_pred_test, y_test)
print(conf_matrix)
[[ 0 0 0 0 0 0 0] [ 0 0 0 0 0 0 0] [ 0 0 0 0 0 0 0] [ 0 0 0 0 0 0 0] [ 0 0 0 0 0 0 0] [ 5 0 1 10 11 125 75] [ 0 1 1 3 2 13 69]] [[ 0 0 0 0 0] [ 0 0 0 0 0] [ 0 0 0 0 0] [ 1 12 5 53 29] [ 0 0 0 9 27]]
In [ ]:
#Binarize by one-hot encoding to be able to generate ROC curve
label_binarizer = LabelBinarizer().fit(y_train)
y_onehot_test = label_binarizer.transform(y_test)
y_onehot_test.shape # (n_samples, n_classes)
Out[ ]:
(136, 7)
In [ ]:
n_samples, n_features = X.shape
n_classes = len(np.unique(y))
In [ ]:
n_names = ['2','4','5','6','7','8','9']
In [ ]:
# store the fpr, tpr, and roc_auc for all averaging strategies
fpr, tpr, roc_auc = dict(), dict(), dict()
# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_onehot_test.ravel(), y_prob_test.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
print(f"Micro-averaged One-vs-Rest ROC AUC score:\n{roc_auc['micro']:.2f}")
Micro-averaged One-vs-Rest ROC AUC score: 0.90
In [ ]:
for i in range(n_classes):
fpr[i], tpr[i], _ = roc_curve(y_onehot_test[:, i], y_prob_test[:, i])
roc_auc[i] = auc(fpr[i], tpr[i])
fpr_grid = np.linspace(0.0, 1.0, 1000)
# Interpolate all ROC curves at these points
mean_tpr = np.zeros_like(fpr_grid)
for i in range(n_classes):
mean_tpr += np.interp(fpr_grid, fpr[i], tpr[i]) # linear interpolation
# Average it and compute AUC
mean_tpr /= n_classes
fpr["macro"] = fpr_grid
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
print(f"Macro-averaged One-vs-Rest ROC AUC score:\n{roc_auc['macro']:.2f}")
Macro-averaged One-vs-Rest ROC AUC score: nan
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_ranking.py:1029: UndefinedMetricWarning: No positive samples in y_true, true positive value should be meaningless warnings.warn( /usr/local/lib/python3.10/dist-packages/sklearn/metrics/_ranking.py:1029: UndefinedMetricWarning: No positive samples in y_true, true positive value should be meaningless warnings.warn(
In [ ]:
fig, ax = plt.subplots(figsize=(6, 6))
plt.plot(
fpr["micro"],
tpr["micro"],
label=f"micro-average ROC curve (AUC = {roc_auc['micro']:.2f})",
color="deeppink",
linestyle=":",
linewidth=4,
)
plt.plot(
fpr["macro"],
tpr["macro"],
label=f"macro-average ROC curve (AUC = {roc_auc['macro']:.2f})",
color="navy",
linestyle=":",
linewidth=4,
)
colors = cycle(["aqua", "darkorange", "cornflowerblue", "lightgreen", "pink", "purple"])
for class_id, color in zip(range(n_classes-1), colors):
RocCurveDisplay.from_predictions(
y_onehot_test[:, class_id],
y_prob_test[:, class_id],
name=f"ROC curve for {n_names[class_id]}",
color=color,
ax=ax,
#plot_chance_level=(class_id == 2),
)
plt.axis("square")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Extension of Receiver Operating Characteristic\nto One-vs-Rest multiclass")
plt.legend()
plt.show()
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_ranking.py:1029: UndefinedMetricWarning: No positive samples in y_true, true positive value should be meaningless warnings.warn( /usr/local/lib/python3.10/dist-packages/sklearn/metrics/_ranking.py:1029: UndefinedMetricWarning: No positive samples in y_true, true positive value should be meaningless warnings.warn(
In [ ]:
#calculate feature importance
tree_imp = dt_prob_3_2.feature_importances_
#create a data frame with feature names
# creating a list of column names
column_values = ['importance']
# creating the dataframe
df_tree = pd.DataFrame(data = tree_imp,
columns = column_values)
df_tree['feature']=X.columns
#sort data so features with largest importance values are at the top
df_tree2 = df_tree.sort_values(by=['importance'], ascending=False)
print(df_tree2)
#Create variable importance plot
plt.figure(figsize=(5, 10))
plt.title('Variable Importance')
plt.xlabel('Importance')
plt.ylabel('Feature')
sns.barplot(data=df_tree2,
y=df_tree2['feature'],
x=df_tree2['importance'], color="lightblue")
plt.show()
importance feature 4 0.455216 host_is_superhost 16 0.333744 weekly_price 17 0.211040 security_deposit 0 0.000000 host_since 15 0.000000 price 27 0.000000 instant_bookable 26 0.000000 number_of_reviews 25 0.000000 availability_365 24 0.000000 availability_90 23 0.000000 availability_60 22 0.000000 availability_30 21 0.000000 minimum_nights 20 0.000000 extra_people 19 0.000000 guests_included 18 0.000000 cleaning_fee 14 0.000000 bed_type 1 0.000000 host_location 13 0.000000 beds 12 0.000000 bedrooms 11 0.000000 bathrooms 10 0.000000 accommodates 9 0.000000 room_type 8 0.000000 property_type 7 0.000000 host_identity_verified 6 0.000000 host_has_profile_pic 5 0.000000 host_listings_count 3 0.000000 host_response_rate 2 0.000000 host_response_time 28 0.000000 cancellation_policy
Less Min Tree¶
In [ ]:
# create an instance of a decision tree classifier using default values
dt_prob_3_3 = DecisionTreeClassifier(max_depth = 5, min_samples_leaf=10, ccp_alpha = 0)
# fit the model to the training data
dt_prob_3_3.fit(X_train, y_train)
Out[ ]:
DecisionTreeClassifier(ccp_alpha=0, max_depth=5, min_samples_leaf=10)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier(ccp_alpha=0, max_depth=5, min_samples_leaf=10)
In [ ]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
# visualize the decision tree
dot_data = export_graphviz(dt_prob_3_3, filled=True, rounded=True, feature_names=X.columns, class_names=['2','4','5','6','7','8','9'])
graph = graphviz.Source(dot_data)
graph.render("decision_tree_less_min_prob_3")
Out[ ]:
'decision_tree_less_min_prob_3.pdf'
In [ ]:
#Show the visualization of the decision tree in this notebook
export_graphviz(dt_prob_3_3,
'tree.dot',
class_names=['2','4','5','6','7','8','9'],
feature_names = X_train.columns)
! dot -Tpng tree.dot -o tree.png
import matplotlib.pyplot as plt
import cv2
%matplotlib inline
img = cv2.imread('tree.png')
plt.figure(figsize = (20, 40))
plt.imshow(img)
Out[ ]:
<matplotlib.image.AxesImage at 0x7c57c458db40>
In [ ]:
# make predictions on the training and test data
y_pred_train = dt_prob_3_3.predict(X_train)
y_pred_test = dt_prob_3_3.predict(X_test)
y_prob_train = dt_prob_3_3.predict_proba(X_train)
y_prob_test = dt_prob_3_3.predict_proba(X_test)
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train)
prec_train = precision_score(y_train, y_pred_train, average=None)
rec_train = recall_score(y_train, y_pred_train, average=None)
# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print(f"Precision: {list(map('{:.4f}'.format,prec_train))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_train))}")
print("")
# calculate the accuracy, precision, and recall scores for the test set
acc_test = accuracy_score(y_test, y_pred_test)
prec_test = precision_score(y_test, y_pred_test, average=None)
rec_test = recall_score(y_test, y_pred_test, average=None)
# print the scores for the test set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print(f"Precision: {list(map('{:.4f}'.format,prec_test))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_test))}")
-- train set -- Accuracy : 0.6614 Precision: ['0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.6645', '0.6585'] Recall: ['0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.7319', '0.7500'] -- test set -- Accuracy : 0.5147 Precision: ['0.0000', '0.0000', '0.0000', '0.5000', '0.5312'] Recall: ['0.0000', '0.0000', '0.0000', '0.5806', '0.6071']
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
In [ ]:
# confusion matrix for training set
conf_matrix = confusion_matrix(y_pred_train, y_train)
print(conf_matrix)
# Generate confusion matrix for test set
conf_matrix = confusion_matrix(y_pred_test, y_test)
print(conf_matrix)
[[ 0 0 0 0 0 0 0] [ 0 0 0 0 0 0 0] [ 0 0 0 0 0 0 0] [ 0 0 0 0 0 0 0] [ 0 0 0 0 0 0 0] [ 1 0 1 4 9 101 36] [ 4 1 1 9 4 37 108]] [[ 0 0 0 0 0] [ 0 0 0 0 0] [ 0 0 0 0 0] [ 1 9 4 36 22] [ 0 3 1 26 34]]
In [ ]:
#Binarize by one-hot encoding to be able to generate ROC curve
label_binarizer = LabelBinarizer().fit(y_train)
y_onehot_test = label_binarizer.transform(y_test)
y_onehot_test.shape # (n_samples, n_classes)
Out[ ]:
(136, 7)
In [ ]:
n_samples, n_features = X.shape
n_classes = len(np.unique(y))
In [ ]:
n_names = ['2','4','5','6','7','8','9']
In [ ]:
# store the fpr, tpr, and roc_auc for all averaging strategies
fpr, tpr, roc_auc = dict(), dict(), dict()
# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_onehot_test.ravel(), y_prob_test.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
print(f"Micro-averaged One-vs-Rest ROC AUC score:\n{roc_auc['micro']:.2f}")
Micro-averaged One-vs-Rest ROC AUC score: 0.87
In [ ]:
for i in range(n_classes):
fpr[i], tpr[i], _ = roc_curve(y_onehot_test[:, i], y_prob_test[:, i])
roc_auc[i] = auc(fpr[i], tpr[i])
fpr_grid = np.linspace(0.0, 1.0, 1000)
# Interpolate all ROC curves at these points
mean_tpr = np.zeros_like(fpr_grid)
for i in range(n_classes):
mean_tpr += np.interp(fpr_grid, fpr[i], tpr[i]) # linear interpolation
# Average it and compute AUC
mean_tpr /= n_classes
fpr["macro"] = fpr_grid
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
print(f"Macro-averaged One-vs-Rest ROC AUC score:\n{roc_auc['macro']:.2f}")
Macro-averaged One-vs-Rest ROC AUC score: nan
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_ranking.py:1029: UndefinedMetricWarning: No positive samples in y_true, true positive value should be meaningless warnings.warn( /usr/local/lib/python3.10/dist-packages/sklearn/metrics/_ranking.py:1029: UndefinedMetricWarning: No positive samples in y_true, true positive value should be meaningless warnings.warn(
In [ ]:
fig, ax = plt.subplots(figsize=(6, 6))
plt.plot(
fpr["micro"],
tpr["micro"],
label=f"micro-average ROC curve (AUC = {roc_auc['micro']:.2f})",
color="deeppink",
linestyle=":",
linewidth=4,
)
plt.plot(
fpr["macro"],
tpr["macro"],
label=f"macro-average ROC curve (AUC = {roc_auc['macro']:.2f})",
color="navy",
linestyle=":",
linewidth=4,
)
colors = cycle(["aqua", "darkorange", "cornflowerblue", "lightgreen", "pink", "purple"])
for class_id, color in zip(range(n_classes-1), colors):
RocCurveDisplay.from_predictions(
y_onehot_test[:, class_id],
y_prob_test[:, class_id],
name=f"ROC curve for {n_names[class_id]}",
color=color,
ax=ax,
#plot_chance_level=(class_id == 2),
)
plt.axis("square")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Extension of Receiver Operating Characteristic\nto One-vs-Rest multiclass")
plt.legend()
plt.show()
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_ranking.py:1029: UndefinedMetricWarning: No positive samples in y_true, true positive value should be meaningless warnings.warn( /usr/local/lib/python3.10/dist-packages/sklearn/metrics/_ranking.py:1029: UndefinedMetricWarning: No positive samples in y_true, true positive value should be meaningless warnings.warn(
In [ ]:
#calculate feature importance
tree_imp = dt_prob_3_3.feature_importances_
#create a data frame with feature names
# creating a list of column names
column_values = ['importance']
# creating the dataframe
df_tree = pd.DataFrame(data = tree_imp,
columns = column_values)
df_tree['feature']=X.columns
#sort data so features with largest importance values are at the top
df_tree2 = df_tree.sort_values(by=['importance'], ascending=False)
print(df_tree2)
#Create variable importance plot
plt.figure(figsize=(5, 10))
plt.title('Variable Importance')
plt.xlabel('Importance')
plt.ylabel('Feature')
sns.barplot(data=df_tree2,
y=df_tree2['feature'],
x=df_tree2['importance'], color="lightblue")
plt.show()
importance feature 4 0.275032 host_is_superhost 16 0.260477 weekly_price 17 0.127506 security_deposit 26 0.102563 number_of_reviews 0 0.063825 host_since 20 0.062049 extra_people 19 0.051791 guests_included 13 0.021509 beds 23 0.018408 availability_60 18 0.016840 cleaning_fee 27 0.000000 instant_bookable 25 0.000000 availability_365 24 0.000000 availability_90 22 0.000000 availability_30 21 0.000000 minimum_nights 14 0.000000 bed_type 15 0.000000 price 1 0.000000 host_location 12 0.000000 bedrooms 11 0.000000 bathrooms 10 0.000000 accommodates 9 0.000000 room_type 8 0.000000 property_type 7 0.000000 host_identity_verified 6 0.000000 host_has_profile_pic 5 0.000000 host_listings_count 3 0.000000 host_response_rate 2 0.000000 host_response_time 28 0.000000 cancellation_policy
Create And Assess Logistic Regression Models¶
Full Logistic¶
In [ ]:
# define the multinomial logistic regression model
logistic_model_prob_3 = LogisticRegression(multi_class='multinomial', solver='lbfgs')
# fit the model on the training data
logistic_model_prob_3.fit(X_train, y_train)
/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
Out[ ]:
LogisticRegression(multi_class='multinomial')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(multi_class='multinomial')
Create The LASSO and Ridge Regression Models¶
In [ ]:
# Create an Instance of Logistic Regression for LASSO Selection using c = 0.1 and c = 0.01
lr_l1_1_prob_3 = LogisticRegression(multi_class='multinomial', solver='saga', penalty='l1', C=0.1)
lr_l1_01_prob_3 = LogisticRegression(multi_class='multinomial', solver='saga', penalty='l1', C=0.01)
# fit the models to the training data
lr_l1_1_prob_3.fit(X_train, y_train)
lr_l1_01_prob_3.fit(X_train, y_train)
# Create an Instance of Logistic Regression for LASSO Selection using c = 1 and c = 0.7
lr_l1_10_prob_3 = LogisticRegression(multi_class='multinomial', solver='saga', penalty='l1', C=1)
lr_l1_7_prob_3 = LogisticRegression(multi_class='multinomial', solver='saga', penalty='l1', C=0.7)
# fit the models to the training data
lr_l1_10_prob_3.fit(X_train, y_train)
lr_l1_7_prob_3.fit(X_train, y_train)
# Create an Instance of Logistic Regression for Ridge Regression (L2 regularization)
lr_l2_prob_3 = LogisticRegression(multi_class='multinomial', solver='lbfgs', penalty='l2')
# fit the models to the training data
lr_l2_prob_3.fit(X_train, y_train)
/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
Out[ ]:
LogisticRegression(multi_class='multinomial')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(multi_class='multinomial')
Analyze The Importance Of Different Categories In The Models¶
In [ ]:
# function for model coefficients
def rpt_model_variables(model):
# Get the intercept term
intercept = model.intercept_
# Access the coefficients (weights) of the model, i rounded them
coefficients = np.round(model.coef_[0],decimals=4)
# Create DataFrames for intercept and coefficients
#df_intercept = pd.DataFrame({'feature': ['Intercept'], 'coefficient': [intercept[0]]})
df_coefficients = pd.DataFrame({'feature': X.columns, 'coefficient': coefficients})
df_coefficients['abs_coefficient'] = df_coefficients['coefficient'].abs()
df_coefficients.sort_values(by='abs_coefficient', ascending=False, inplace=True)
# if you want to add intercept to table
#df_model = pd.concat([df_intercept, df_coefficients]).reset_index(drop=True)
# Print the DataFrame
print(df_coefficients)
return df_coefficients
#Evalute the model coefficients for the models
print("Full Logistic Regression Model")
df_coefficients_full = rpt_model_variables(logistic_model_prob_3)
print("Lasso C=0.1")
df_coefficients1 = rpt_model_variables(lr_l1_1_prob_3)
print("")
print("Lasso C=0.01")
df_coefficients01 = rpt_model_variables(lr_l1_01_prob_3)
print("")
print("Lasso C=1")
df_coefficients10 = rpt_model_variables(lr_l1_10_prob_3)
print("")
print("Lasso C=0.7")
df_coefficients7 = rpt_model_variables(lr_l1_7_prob_3)
print("")
print("Ridge Regression")
df_coefficients2 = rpt_model_variables(lr_l2_prob_3)
Full Logistic Regression Model
feature coefficient abs_coefficient
17 security_deposit -0.0084 0.0084
5 host_listings_count 0.0035 0.0035
25 availability_365 -0.0021 0.0021
26 number_of_reviews -0.0013 0.0013
3 host_response_rate 0.0007 0.0007
23 availability_60 0.0006 0.0006
15 price -0.0004 0.0004
8 property_type -0.0004 0.0004
20 extra_people -0.0004 0.0004
24 availability_90 0.0003 0.0003
18 cleaning_fee -0.0002 0.0002
10 accommodates 0.0001 0.0001
16 weekly_price 0.0001 0.0001
2 host_response_time 0.0001 0.0001
0 host_since -0.0001 0.0001
22 availability_30 0.0000 0.0000
21 minimum_nights 0.0000 0.0000
19 guests_included 0.0000 0.0000
27 instant_bookable 0.0000 0.0000
14 bed_type 0.0000 0.0000
1 host_location 0.0000 0.0000
13 beds 0.0000 0.0000
12 bedrooms 0.0000 0.0000
11 bathrooms -0.0000 0.0000
9 room_type 0.0000 0.0000
7 host_identity_verified 0.0000 0.0000
6 host_has_profile_pic -0.0000 0.0000
4 host_is_superhost -0.0000 0.0000
28 cancellation_policy 0.0000 0.0000
Lasso C=0.1
feature coefficient abs_coefficient
17 security_deposit -0.0012 0.0012
5 host_listings_count 0.0006 0.0006
0 host_since -0.0005 0.0005
16 weekly_price 0.0002 0.0002
26 number_of_reviews -0.0001 0.0001
25 availability_365 -0.0001 0.0001
15 price 0.0000 0.0000
27 instant_bookable 0.0000 0.0000
24 availability_90 0.0000 0.0000
23 availability_60 0.0000 0.0000
22 availability_30 0.0000 0.0000
21 minimum_nights 0.0000 0.0000
20 extra_people -0.0000 0.0000
19 guests_included 0.0000 0.0000
18 cleaning_fee 0.0000 0.0000
14 bed_type 0.0000 0.0000
1 host_location 0.0000 0.0000
13 beds 0.0000 0.0000
12 bedrooms 0.0000 0.0000
11 bathrooms 0.0000 0.0000
10 accommodates 0.0000 0.0000
9 room_type 0.0000 0.0000
8 property_type -0.0000 0.0000
7 host_identity_verified 0.0000 0.0000
6 host_has_profile_pic 0.0000 0.0000
4 host_is_superhost 0.0000 0.0000
3 host_response_rate 0.0000 0.0000
2 host_response_time 0.0000 0.0000
28 cancellation_policy 0.0000 0.0000
Lasso C=0.01
feature coefficient abs_coefficient
17 security_deposit -0.0011 0.0011
0 host_since -0.0005 0.0005
5 host_listings_count 0.0005 0.0005
16 weekly_price 0.0002 0.0002
15 price 0.0000 0.0000
27 instant_bookable 0.0000 0.0000
26 number_of_reviews 0.0000 0.0000
25 availability_365 0.0000 0.0000
24 availability_90 0.0000 0.0000
23 availability_60 0.0000 0.0000
22 availability_30 0.0000 0.0000
21 minimum_nights 0.0000 0.0000
20 extra_people 0.0000 0.0000
19 guests_included 0.0000 0.0000
18 cleaning_fee 0.0000 0.0000
14 bed_type 0.0000 0.0000
1 host_location 0.0000 0.0000
13 beds 0.0000 0.0000
12 bedrooms 0.0000 0.0000
11 bathrooms 0.0000 0.0000
10 accommodates 0.0000 0.0000
9 room_type 0.0000 0.0000
8 property_type 0.0000 0.0000
7 host_identity_verified 0.0000 0.0000
6 host_has_profile_pic 0.0000 0.0000
4 host_is_superhost 0.0000 0.0000
3 host_response_rate 0.0000 0.0000
2 host_response_time 0.0000 0.0000
28 cancellation_policy 0.0000 0.0000
Lasso C=1
feature coefficient abs_coefficient
17 security_deposit -0.0012 0.0012
5 host_listings_count 0.0006 0.0006
0 host_since -0.0005 0.0005
16 weekly_price 0.0002 0.0002
26 number_of_reviews -0.0001 0.0001
25 availability_365 -0.0001 0.0001
20 extra_people -0.0001 0.0001
15 price 0.0000 0.0000
27 instant_bookable 0.0000 0.0000
24 availability_90 0.0000 0.0000
23 availability_60 0.0000 0.0000
22 availability_30 0.0000 0.0000
21 minimum_nights 0.0000 0.0000
19 guests_included -0.0000 0.0000
18 cleaning_fee 0.0000 0.0000
14 bed_type 0.0000 0.0000
1 host_location 0.0000 0.0000
13 beds 0.0000 0.0000
12 bedrooms 0.0000 0.0000
11 bathrooms 0.0000 0.0000
10 accommodates 0.0000 0.0000
9 room_type 0.0000 0.0000
8 property_type -0.0000 0.0000
7 host_identity_verified 0.0000 0.0000
6 host_has_profile_pic 0.0000 0.0000
4 host_is_superhost 0.0000 0.0000
3 host_response_rate 0.0000 0.0000
2 host_response_time 0.0000 0.0000
28 cancellation_policy 0.0000 0.0000
Lasso C=0.7
feature coefficient abs_coefficient
17 security_deposit -0.0012 0.0012
5 host_listings_count 0.0006 0.0006
0 host_since -0.0005 0.0005
16 weekly_price 0.0002 0.0002
26 number_of_reviews -0.0001 0.0001
25 availability_365 -0.0001 0.0001
20 extra_people -0.0001 0.0001
15 price 0.0000 0.0000
27 instant_bookable 0.0000 0.0000
24 availability_90 0.0000 0.0000
23 availability_60 0.0000 0.0000
22 availability_30 0.0000 0.0000
21 minimum_nights 0.0000 0.0000
19 guests_included -0.0000 0.0000
18 cleaning_fee 0.0000 0.0000
14 bed_type 0.0000 0.0000
1 host_location 0.0000 0.0000
13 beds 0.0000 0.0000
12 bedrooms 0.0000 0.0000
11 bathrooms 0.0000 0.0000
10 accommodates 0.0000 0.0000
9 room_type 0.0000 0.0000
8 property_type -0.0000 0.0000
7 host_identity_verified 0.0000 0.0000
6 host_has_profile_pic 0.0000 0.0000
4 host_is_superhost 0.0000 0.0000
3 host_response_rate 0.0000 0.0000
2 host_response_time 0.0000 0.0000
28 cancellation_policy 0.0000 0.0000
Ridge Regression
feature coefficient abs_coefficient
17 security_deposit -0.0084 0.0084
5 host_listings_count 0.0035 0.0035
25 availability_365 -0.0021 0.0021
26 number_of_reviews -0.0013 0.0013
3 host_response_rate 0.0007 0.0007
23 availability_60 0.0006 0.0006
15 price -0.0004 0.0004
8 property_type -0.0004 0.0004
20 extra_people -0.0004 0.0004
24 availability_90 0.0003 0.0003
18 cleaning_fee -0.0002 0.0002
10 accommodates 0.0001 0.0001
16 weekly_price 0.0001 0.0001
2 host_response_time 0.0001 0.0001
0 host_since -0.0001 0.0001
22 availability_30 0.0000 0.0000
21 minimum_nights 0.0000 0.0000
19 guests_included 0.0000 0.0000
27 instant_bookable 0.0000 0.0000
14 bed_type 0.0000 0.0000
1 host_location 0.0000 0.0000
13 beds 0.0000 0.0000
12 bedrooms 0.0000 0.0000
11 bathrooms -0.0000 0.0000
9 room_type 0.0000 0.0000
7 host_identity_verified 0.0000 0.0000
6 host_has_profile_pic -0.0000 0.0000
4 host_is_superhost -0.0000 0.0000
28 cancellation_policy 0.0000 0.0000
In [ ]:
# plot variable importance
# function to plot variable importance by creating a bar chart
# of absolute coefficients
def plot_variable_imp(df_coef):
# determine the variables the model is using and create df
# of their absolute coefficients
df_plt = df_coef[df_coef['abs_coefficient'] != 0]
# determine the variables the model is not using
reject_vars = df_coef[df_coef['abs_coefficient'] == 0]['feature'].tolist()
# bar graph of the absolute coefficients that the model is using
plt.figure(figsize=(5, 10))
plt.title('Variable Importance')
plt.xlabel('Coefficient')
plt.ylabel('Feature')
sns.barplot(data=df_plt,
y=df_plt['feature'],
x=df_plt['abs_coefficient'], color="lightblue")
plt.show()
# print the variables the model is not using after the bar graph
print("-- rejected --")
for i in reject_vars:
print(f" {i}")
# plot the variable importance for the models
print("Full Logistic Regression Model")
plot_variable_imp(df_coefficients_full)
print("")
print("Lasso C=0.1")
plot_variable_imp(df_coefficients1)
print("")
print("Lasso C=0.01")
plot_variable_imp(df_coefficients01)
print("")
print("Lasso C=1")
plot_variable_imp(df_coefficients10)
print("")
print("Lasso C=0.7")
plot_variable_imp(df_coefficients7)
print("")
print("Ridge Regression")
plot_variable_imp(df_coefficients2)
Full Logistic Regression Model
-- rejected -- availability_30 minimum_nights guests_included instant_bookable bed_type host_location beds bedrooms bathrooms room_type host_identity_verified host_has_profile_pic host_is_superhost cancellation_policy Lasso C=0.1
-- rejected -- price instant_bookable availability_90 availability_60 availability_30 minimum_nights extra_people guests_included cleaning_fee bed_type host_location beds bedrooms bathrooms accommodates room_type property_type host_identity_verified host_has_profile_pic host_is_superhost host_response_rate host_response_time cancellation_policy Lasso C=0.01
-- rejected -- price instant_bookable number_of_reviews availability_365 availability_90 availability_60 availability_30 minimum_nights extra_people guests_included cleaning_fee bed_type host_location beds bedrooms bathrooms accommodates room_type property_type host_identity_verified host_has_profile_pic host_is_superhost host_response_rate host_response_time cancellation_policy Lasso C=1
-- rejected -- price instant_bookable availability_90 availability_60 availability_30 minimum_nights guests_included cleaning_fee bed_type host_location beds bedrooms bathrooms accommodates room_type property_type host_identity_verified host_has_profile_pic host_is_superhost host_response_rate host_response_time cancellation_policy Lasso C=0.7
-- rejected -- price instant_bookable availability_90 availability_60 availability_30 minimum_nights guests_included cleaning_fee bed_type host_location beds bedrooms bathrooms accommodates room_type property_type host_identity_verified host_has_profile_pic host_is_superhost host_response_rate host_response_time cancellation_policy Ridge Regression
-- rejected -- availability_30 minimum_nights guests_included instant_bookable bed_type host_location beds bedrooms bathrooms room_type host_identity_verified host_has_profile_pic host_is_superhost cancellation_policy
Make Predictions To Evaluate The Models¶
In [ ]:
# make predictions on the training and testing data for all of the models to
# evaluate the models
#Full Regression
y_pred_train_full = logistic_model_prob_3.predict(X_train)
y_pred_test_full = logistic_model_prob_3.predict(X_test)
y_proba_train_full = logistic_model_prob_3.predict_proba(X_train)
y_proba_test_full = logistic_model_prob_3.predict_proba(X_test)
#Lasso C=0.1
y_pred_train = lr_l1_1_prob_3.predict(X_train)
y_pred_test = lr_l1_1_prob_3.predict(X_test)
y_proba_train = lr_l1_1_prob_3.predict_proba(X_train)
y_proba_test = lr_l1_1_prob_3.predict_proba(X_test)
#Lasso C=0.01
y_pred_train1 = lr_l1_01_prob_3.predict(X_train)
y_pred_test1 = lr_l1_01_prob_3.predict(X_test)
y_proba_train1 = lr_l1_01_prob_3.predict_proba(X_train)
y_proba_test1 = lr_l1_01_prob_3.predict_proba(X_test)
#Lasso C=1
y_pred_train10 = lr_l1_10_prob_3.predict(X_train)
y_pred_test10 = lr_l1_10_prob_3.predict(X_test)
y_proba_train10 = lr_l1_10_prob_3.predict_proba(X_train)
y_proba_test10 = lr_l1_10_prob_3.predict_proba(X_test)
#Lasso C=0.7
y_pred_train7 = lr_l1_7_prob_3.predict(X_train)
y_pred_test7 = lr_l1_7_prob_3.predict(X_test)
y_proba_train7 = lr_l1_7_prob_3.predict_proba(X_train)
y_proba_test7 = lr_l1_7_prob_3.predict_proba(X_test)
#Ridge Regression
y_pred_train2 = lr_l2_prob_3.predict(X_train)
y_pred_test2 = lr_l2_prob_3.predict(X_test)
y_proba_train2 = lr_l2_prob_3.predict_proba(X_train)
y_proba_test2 = lr_l2_prob_3.predict_proba(X_test)
Evaluate The Models¶
Full Model¶
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train_full)
prec_train = precision_score(y_train, y_pred_train_full, average=None)
rec_train = recall_score(y_train, y_pred_train_full, average=None)
# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print(f"Precision: {list(map('{:.4f}'.format,prec_train))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_train))}")
print("")
# calculate the accuracy, precision, and recall scores for the test set
acc_test = accuracy_score(y_test, y_pred_test_full)
prec_test = precision_score(y_test, y_pred_test_full, average=None)
rec_test = recall_score(y_test, y_pred_test_full, average=None)
# print the scores for the test set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print(f"Precision: {list(map('{:.4f}'.format,prec_test))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_test))}")
-- train set -- Accuracy : 0.5854 Precision: ['0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.5542', '0.6200'] Recall: ['0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.6667', '0.6458'] -- test set -- Accuracy : 0.5147 Precision: ['0.0000', '0.0000', '0.0000', '0.4872', '0.5517'] Recall: ['0.0000', '0.0000', '0.0000', '0.6129', '0.5714']
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
L1 with c=0.1¶
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train)
prec_train = precision_score(y_train, y_pred_train, average=None)
rec_train = recall_score(y_train, y_pred_train, average=None)
# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print(f"Precision: {list(map('{:.4f}'.format,prec_train))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_train))}")
print("")
# calculate the accuracy, precision, and recall scores for the test set
acc_test = accuracy_score(y_test, y_pred_test)
prec_test = precision_score(y_test, y_pred_test, average=None)
rec_test = recall_score(y_test, y_pred_test, average=None)
# print the scores for the test set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print(f"Precision: {list(map('{:.4f}'.format,prec_test))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_test))}")
-- train set -- Accuracy : 0.5190 Precision: ['0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.4911', '0.5510'] Recall: ['0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.6014', '0.5625'] -- test set -- Accuracy : 0.5221 Precision: ['0.0000', '0.0000', '0.0000', '0.5205', '0.5238'] Recall: ['0.0000', '0.0000', '0.0000', '0.6129', '0.5893']
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
L1 with c=0.01¶
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train1)
prec_train = precision_score(y_train, y_pred_train1, average=None)
rec_train = recall_score(y_train, y_pred_train1, average=None)
# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print(f"Precision: {list(map('{:.4f}'.format,prec_train))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_train))}")
print("")
# calculate the accuracy, precision, and recall scores for the test set
acc_test = accuracy_score(y_test, y_pred_test1)
prec_test = precision_score(y_test, y_pred_test1, average=None)
rec_test = recall_score(y_test, y_pred_test1, average=None)
# print the scores for the test set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print(f"Precision: {list(map('{:.4f}'.format,prec_test))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_test))}")
-- train set -- Accuracy : 0.5222 Precision: ['0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.4940', '0.5541'] Recall: ['0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.6014', '0.5694'] -- test set -- Accuracy : 0.5368 Precision: ['0.0000', '0.0000', '0.0000', '0.5352', '0.5385'] Recall: ['0.0000', '0.0000', '0.0000', '0.6129', '0.6250']
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
L1 with C=1¶
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train10)
prec_train = precision_score(y_train, y_pred_train10, average=None)
rec_train = recall_score(y_train, y_pred_train10, average=None)
# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print(f"Precision: {list(map('{:.4f}'.format,prec_train))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_train))}")
print("")
# calculate the accuracy, precision, and recall scores for the test set
acc_test = accuracy_score(y_test, y_pred_test10)
prec_test = precision_score(y_test, y_pred_test10, average=None)
rec_test = recall_score(y_test, y_pred_test10, average=None)
# print the scores for the test set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print(f"Precision: {list(map('{:.4f}'.format,prec_test))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_test))}")
-- train set -- Accuracy : 0.5190 Precision: ['0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.4911', '0.5510'] Recall: ['0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.6014', '0.5625'] -- test set -- Accuracy : 0.5221 Precision: ['0.0000', '0.0000', '0.0000', '0.5205', '0.5238'] Recall: ['0.0000', '0.0000', '0.0000', '0.6129', '0.5893']
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
L1 with C=0.7¶
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train7)
prec_train = precision_score(y_train, y_pred_train7, average=None)
rec_train = recall_score(y_train, y_pred_train7, average=None)
# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print(f"Precision: {list(map('{:.4f}'.format,prec_train))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_train))}")
print("")
# calculate the accuracy, precision, and recall scores for the test set
acc_test = accuracy_score(y_test, y_pred_test7)
prec_test = precision_score(y_test, y_pred_test7, average=None)
rec_test = recall_score(y_test, y_pred_test7, average=None)
# print the scores for the test set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print(f"Precision: {list(map('{:.4f}'.format,prec_test))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_test))}")
-- train set -- Accuracy : 0.5190 Precision: ['0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.4911', '0.5510'] Recall: ['0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.6014', '0.5625'] -- test set -- Accuracy : 0.5221 Precision: ['0.0000', '0.0000', '0.0000', '0.5205', '0.5238'] Recall: ['0.0000', '0.0000', '0.0000', '0.6129', '0.5893']
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
L2 Regularization¶
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train2)
prec_train = precision_score(y_train, y_pred_train2, average=None)
rec_train = recall_score(y_train, y_pred_train2, average=None)
# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print(f"Precision: {list(map('{:.4f}'.format,prec_train))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_train))}")
print("")
# calculate the accuracy, precision, and recall scores for the test set
acc_test = accuracy_score(y_test, y_pred_test2)
prec_test = precision_score(y_test, y_pred_test2, average=None)
rec_test = recall_score(y_test, y_pred_test2, average=None)
# print the scores for the test set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print(f"Precision: {list(map('{:.4f}'.format,prec_test))}")
print(f"Recall: {list(map('{:.4f}'.format,rec_test))}")
-- train set -- Accuracy : 0.5854 Precision: ['0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.5542', '0.6200'] Recall: ['0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.6667', '0.6458'] -- test set -- Accuracy : 0.5147 Precision: ['0.0000', '0.0000', '0.0000', '0.4872', '0.5517'] Recall: ['0.0000', '0.0000', '0.0000', '0.6129', '0.5714']
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
Exploratory Analysis¶
In [ ]:
df_prob_2
Out[ ]:
| host_since | host_location | host_response_time | host_response_rate | host_is_superhost | host_listings_count | host_has_profile_pic | host_identity_verified | property_type | room_type | ... | availability_90 | review_scores_rating | review_scores_accuracy | review_scores_cleanliness | review_scores_checkin | review_scores_communication | review_scores_location | review_scores_value | instant_bookable | cancellation_policy | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 7 | 2014 | 1 | 3 | 100.0 | 0 | 1.0 | 1 | 1 | 9 | 1 | ... | 0 | 100.0 | 10.0 | 10.0 | 10.0 | 10.0 | 10.0 | 10.0 | 1 | 1 |
| 9 | 2012 | 1 | 3 | 100.0 | 1 | 1.0 | 1 | 1 | 9 | 1 | ... | 0 | 99.0 | 10.0 | 10.0 | 10.0 | 10.0 | 10.0 | 10.0 | 0 | 1 |
| 11 | 2011 | 1 | 3 | 100.0 | 0 | 1.0 | 1 | 1 | 9 | 1 | ... | 0 | 93.0 | 10.0 | 10.0 | 10.0 | 10.0 | 10.0 | 10.0 | 0 | 3 |
| 12 | 2013 | 1 | 3 | 97.0 | 1 | 9.0 | 1 | 1 | 9 | 1 | ... | 0 | 100.0 | 9.0 | 10.0 | 10.0 | 10.0 | 10.0 | 10.0 | 0 | 3 |
| 13 | 2013 | 1 | 3 | 97.0 | 1 | 9.0 | 1 | 1 | 9 | 1 | ... | 0 | 100.0 | 10.0 | 10.0 | 10.0 | 10.0 | 10.0 | 10.0 | 0 | 3 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5826 | 2013 | 1 | 2 | 100.0 | 0 | 339.0 | 1 | 1 | 0 | 0 | ... | 0 | 85.0 | 9.0 | 9.0 | 9.0 | 9.0 | 10.0 | 8.0 | 0 | 3 |
| 5827 | 2011 | 1 | 2 | 87.0 | 0 | 19.0 | 1 | 1 | 0 | 0 | ... | 0 | 94.0 | 9.0 | 9.0 | 10.0 | 9.0 | 10.0 | 9.0 | 0 | 3 |
| 5829 | 2012 | 1 | 2 | 95.0 | 0 | 11.0 | 1 | 0 | 0 | 0 | ... | 0 | 88.0 | 10.0 | 10.0 | 10.0 | 10.0 | 10.0 | 10.0 | 0 | 1 |
| 5830 | 2013 | 1 | 2 | 100.0 | 1 | 21.0 | 1 | 1 | 0 | 0 | ... | 0 | 100.0 | 10.0 | 10.0 | 10.0 | 10.0 | 10.0 | 9.0 | 0 | 3 |
| 5832 | 2014 | 0 | 3 | 100.0 | 0 | 5.0 | 1 | 0 | 9 | 0 | ... | 0 | 100.0 | 8.0 | 10.0 | 10.0 | 10.0 | 10.0 | 8.0 | 1 | 3 |
3372 rows × 32 columns
In [ ]:
booked_avg_price = df_prob_2.groupby('availability_90')['price'].mean().reset_index()
booked_avg_price['availability_90'] = booked_avg_price['availability_90'].astype('string')
booked_avg_price['availability_90'][0] = "Non-Booked"
booked_avg_price['availability_90'][1] = "Booked"
booked_avg_price
Out[ ]:
| availability_90 | price | |
|---|---|---|
| 0 | Non-Booked | 211.695888 |
| 1 | Booked | 199.032746 |
In [ ]:
#Plot the average price of booked and nonbooked AirBnBs
plt.figure(figsize=(20, 10))
plot = sns.barplot(data=booked_avg_price,
x='availability_90', y='price', color='lightblue')
plot.yaxis.set_major_formatter('${x:1.0f}') # Make y-axis currency
#Give the column chart a title
plt.title("Average Price Of Booked And Non-Booked AirBnBs", fontsize = 20)
plot.set(xlabel=None, ylabel=None) #Get rid of the title for the x and y-axis
#Change the font sizes of the ticks on the x and y-axis
plt.tick_params(axis='x', which='major', labelsize=16)
plt.tick_params(axis='y', which='major', labelsize=11)
sns.despine() #Do not have a top and right side border
# From https://medium.com/swlh/quick-guide-to-labelling-data-for-common-seaborn-plots-736e10bf14a9
# label each bar in column chart with the percentage of customers who brought three organic
# products in that loyalty status
for p in plot.patches:
# get the height of each bar
height = p.get_height()
# adding text to each bar
plot.text(x = p.get_x()+(p.get_width()/2), # x-coordinate position of data label, padded to be in the middle of the bar
y = height+3, # y-coordinate position of data label, padded 3 above bar
s = '${:.2f}'.format(height), # data label format of currency
fontsize = 14,
ha = 'center') # sets horizontal alignment (ha) to center
plt.show()
In [ ]:
booked_avg_review_score = df_prob_2.groupby('availability_90')['review_scores_rating'].mean().reset_index()
booked_avg_review_score
booked_avg_review_score['availability_90'] = booked_avg_price['availability_90'].astype('string')
booked_avg_review_score['availability_90'][0] = "Non-Booked"
booked_avg_review_score['availability_90'][1] = "Booked"
booked_avg_review_score['review_scores_rating'] = booked_avg_review_score['review_scores_rating'] / 100
booked_avg_review_score
Out[ ]:
| availability_90 | review_scores_rating | |
|---|---|---|
| 0 | Non-Booked | 0.953491 |
| 1 | Booked | 0.959433 |
In [ ]:
#Plot the average price of booked and nonbooked AirBnBs
plt.figure(figsize=(20, 10))
plot = sns.barplot(data=booked_avg_review_score,
x='availability_90', y='review_scores_rating', color='lightblue')
plot.yaxis.set_major_formatter(PercentFormatter(1)) #Make y-axis of percentages
#Give the column chart a title
plt.title("Average Review Scores Of Booked And Non-Booked AirBnBs", fontsize = 20)
plot.set(xlabel=None, ylabel=None) #Get rid of the title for the x and y-axis
#Change the font sizes of the ticks on the x and y-axis
plt.tick_params(axis='x', which='major', labelsize=16)
plt.tick_params(axis='y', which='major', labelsize=11)
sns.despine() #Do not have a top and right side border
# From https://medium.com/swlh/quick-guide-to-labelling-data-for-common-seaborn-plots-736e10bf14a9
# label each bar in column chart with the percentage of customers who brought three organic
# products in that loyalty status
for p in plot.patches:
# get the height of each bar
height = p.get_height()
# adding text to each bar
plot.text(x = p.get_x()+(p.get_width()/2), # x-coordinate position of data label, padded to be in the middle of the bar
y = height+0.01, # y-coordinate position of data label, padded 0.01 above bar
s = '{:.2%}'.format(height), # data label format of percentage
fontsize = 14,
ha = 'center') # sets horizontal alignment (ha) to center
plt.show()
In [ ]:
booked_avg_min_nights = df_prob_2.groupby('availability_90')['minimum_nights'].mean().reset_index()
booked_avg_min_nights['availability_90'] = booked_avg_min_nights['availability_90'].astype('string')
booked_avg_min_nights['availability_90'][0] = "Non-Booked"
booked_avg_min_nights['availability_90'][1] = "Booked"
booked_avg_min_nights
Out[ ]:
| availability_90 | minimum_nights | |
|---|---|---|
| 0 | Non-Booked | 1.911947 |
| 1 | Booked | 2.134761 |
In [ ]:
#Plot the average price of booked and nonbooked AirBnBs
plt.figure(figsize=(20, 10))
plot = sns.barplot(data=booked_avg_min_nights,
x='availability_90', y='minimum_nights', color='lightblue')
#Give the column chart a title
plt.title("Average Minimum Nights Of Booked And Non-Booked AirBnBs", fontsize = 20)
plot.set(xlabel=None, ylabel=None) #Get rid of the title for the x and y-axis
#Change the font sizes of the ticks on the x and y-axis
plt.tick_params(axis='x', which='major', labelsize=16)
plt.tick_params(axis='y', which='major', labelsize=11)
sns.despine() #Do not have a top and right side border
# From https://medium.com/swlh/quick-guide-to-labelling-data-for-common-seaborn-plots-736e10bf14a9
# label each bar in column chart with the percentage of customers who brought three organic
# products in that loyalty status
for p in plot.patches:
# get the height of each bar
height = p.get_height()
# adding text to each bar
plot.text(x = p.get_x()+(p.get_width()/2), # x-coordinate position of data label, padded to be in the middle of the bar
y = height+0.02, # y-coordinate position of data label, padded 3 above bar
s = '{:.2f}'.format(height), # data label format of currency
fontsize = 14,
ha = 'center') # sets horizontal alignment (ha) to center
plt.show()
In [ ]:
booked_min_nights = df_prob_2.groupby('availability_90')['minimum_nights'].agg(['min', 'mean', 'median', 'max']).reset_index()
booked_min_nights['availability_90'] = booked_avg_min_nights['availability_90'].astype('string')
booked_min_nights['availability_90'][0] = "Non-Booked"
booked_min_nights['availability_90'][1] = "Booked"
booked_min_nights
Out[ ]:
| availability_90 | min | mean | median | max | |
|---|---|---|---|---|---|
| 0 | Non-Booked | 1 | 1.911947 | 2.0 | 45 |
| 1 | Booked | 1 | 2.134761 | 2.0 | 60 |
In [ ]:
booked_avg_sec_deposit = df_prob_2.groupby('availability_90')['security_deposit'].mean().reset_index()
booked_avg_sec_deposit['availability_90'] = booked_avg_sec_deposit['availability_90'].astype('string')
booked_avg_sec_deposit['availability_90'][0] = "Non-Booked"
booked_avg_sec_deposit['availability_90'][1] = "Booked"
booked_avg_sec_deposit
Out[ ]:
| availability_90 | security_deposit | |
|---|---|---|
| 0 | Non-Booked | 193.270365 |
| 1 | Booked | 189.195214 |
In [ ]:
#Plot the average price of booked and nonbooked AirBnBs
plt.figure(figsize=(20, 10))
plot = sns.barplot(data=booked_avg_sec_deposit,
x='availability_90', y='security_deposit', color='lightblue')
plot.yaxis.set_major_formatter('${x:1.0f}') # Make y-axis currency
#Give the column chart a title
plt.title("Average Security Deposit Of Booked And Non-Booked AirBnBs", fontsize = 20)
plot.set(xlabel=None, ylabel=None) #Get rid of the title for the x and y-axis
#Change the font sizes of the ticks on the x and y-axis
plt.tick_params(axis='x', which='major', labelsize=16)
plt.tick_params(axis='y', which='major', labelsize=11)
sns.despine() #Do not have a top and right side border
# From https://medium.com/swlh/quick-guide-to-labelling-data-for-common-seaborn-plots-736e10bf14a9
# label each bar in column chart with the percentage of customers who brought three organic
# products in that loyalty status
for p in plot.patches:
# get the height of each bar
height = p.get_height()
# adding text to each bar
plot.text(x = p.get_x()+(p.get_width()/2), # x-coordinate position of data label, padded to be in the middle of the bar
y = height+3, # y-coordinate position of data label, padded 3 above bar
s = '${:.2f}'.format(height), # data label format of currency
fontsize = 14,
ha = 'center') # sets horizontal alignment (ha) to center
plt.show()
In [ ]:
booked_sec_deposit = df_prob_2.groupby('availability_90')['security_deposit'].agg(['min', 'mean', 'median', 'max']).reset_index()
booked_sec_deposit['availability_90'] = booked_sec_deposit['availability_90'].astype('string')
booked_sec_deposit['availability_90'][0] = "Non-Booked"
booked_sec_deposit['availability_90'][1] = "Booked"
booked_sec_deposit
Out[ ]:
| availability_90 | min | mean | median | max | |
|---|---|---|---|---|---|
| 0 | Non-Booked | 0.0 | 193.270365 | 100.0 | 5000.0 |
| 1 | Booked | 0.0 | 189.195214 | 0.0 | 3000.0 |
In [ ]:
print("Value Counts Non-Booked:", df_prob_2.groupby('availability_90')['security_deposit'].value_counts()[0])
print("Percentages Non-Booked:", df_prob_2.groupby('availability_90')['security_deposit'].value_counts()[0] / df_prob_2.groupby('availability_90')['security_deposit'].count()[0])
print("Value Counts Booked:", df_prob_2.groupby('availability_90')['security_deposit'].value_counts()[1])
print("Percentages Booked:", df_prob_2.groupby('availability_90')['security_deposit'].value_counts()[1] / df_prob_2.groupby('availability_90')['security_deposit'].count()[1])
Value Counts Non-Booked: security_deposit 0.0 1227 500.0 242 100.0 213 200.0 206 250.0 151 300.0 126 1000.0 82 150.0 78 95.0 56 400.0 41 350.0 28 600.0 12 125.0 11 1500.0 11 750.0 10 2000.0 10 800.0 7 450.0 6 700.0 6 295.0 5 2500.0 5 99.0 4 195.0 3 1200.0 3 120.0 2 129.0 2 175.0 2 225.0 2 550.0 2 850.0 2 900.0 2 3000.0 2 160.0 1 180.0 1 185.0 1 234.0 1 240.0 1 275.0 1 280.0 1 290.0 1 315.0 1 375.0 1 650.0 1 675.0 1 945.0 1 999.0 1 1250.0 1 1600.0 1 1899.0 1 2900.0 1 5000.0 1 Name: security_deposit, dtype: int64 Percentages Non-Booked: security_deposit 0.0 0.475950 500.0 0.093871 100.0 0.082622 200.0 0.079907 250.0 0.058573 300.0 0.048875 1000.0 0.031808 150.0 0.030256 95.0 0.021722 400.0 0.015904 350.0 0.010861 600.0 0.004655 125.0 0.004267 1500.0 0.004267 750.0 0.003879 2000.0 0.003879 800.0 0.002715 450.0 0.002327 700.0 0.002327 295.0 0.001939 2500.0 0.001939 99.0 0.001552 195.0 0.001164 1200.0 0.001164 120.0 0.000776 129.0 0.000776 175.0 0.000776 225.0 0.000776 550.0 0.000776 850.0 0.000776 900.0 0.000776 3000.0 0.000776 160.0 0.000388 180.0 0.000388 185.0 0.000388 234.0 0.000388 240.0 0.000388 275.0 0.000388 280.0 0.000388 290.0 0.000388 315.0 0.000388 375.0 0.000388 650.0 0.000388 675.0 0.000388 945.0 0.000388 999.0 0.000388 1250.0 0.000388 1600.0 0.000388 1899.0 0.000388 2900.0 0.000388 5000.0 0.000388 Name: security_deposit, dtype: float64 Value Counts Booked: security_deposit 0.0 409 500.0 64 200.0 55 100.0 52 300.0 47 150.0 31 250.0 31 1000.0 18 350.0 12 400.0 12 95.0 11 750.0 7 450.0 4 600.0 4 1500.0 4 125.0 3 175.0 3 199.0 3 700.0 3 800.0 3 1200.0 3 3000.0 3 900.0 2 149.0 1 180.0 1 275.0 1 375.0 1 1100.0 1 1250.0 1 1400.0 1 2000.0 1 2400.0 1 2500.0 1 Name: security_deposit, dtype: int64 Percentages Booked: security_deposit 0.0 0.515113 500.0 0.080605 200.0 0.069270 100.0 0.065491 300.0 0.059194 150.0 0.039043 250.0 0.039043 1000.0 0.022670 350.0 0.015113 400.0 0.015113 95.0 0.013854 750.0 0.008816 450.0 0.005038 600.0 0.005038 1500.0 0.005038 125.0 0.003778 175.0 0.003778 199.0 0.003778 700.0 0.003778 800.0 0.003778 1200.0 0.003778 3000.0 0.003778 900.0 0.002519 149.0 0.001259 180.0 0.001259 275.0 0.001259 375.0 0.001259 1100.0 0.001259 1250.0 0.001259 1400.0 0.001259 2000.0 0.001259 2400.0 0.001259 2500.0 0.001259 Name: security_deposit, dtype: float64
In [ ]:
booked_room_type = df_prob_2.groupby('availability_90')['room_type'].agg(['min', 'mean', 'median', 'max']).reset_index()
booked_room_type['availability_90'] = booked_room_type['availability_90'].astype('string')
booked_room_type['availability_90'][0] = "Non-Booked"
booked_room_type['availability_90'][1] = "Booked"
booked_room_type
Out[ ]:
| availability_90 | min | mean | median | max | |
|---|---|---|---|---|---|
| 0 | Non-Booked | 0 | 0.330877 | 0.0 | 2 |
| 1 | Booked | 0 | 0.240554 | 0.0 | 2 |
In [ ]:
print("Value Counts:", df_prob_2.groupby('availability_90')['room_type'].value_counts())
print("Percentages:", df_prob_2.groupby('availability_90')['room_type'].value_counts() / df_prob_2.groupby('availability_90')['room_type'].count())
Value Counts: availability_90 room_type
0 0 1777
1 749
2 52
1 0 616
1 165
2 13
Name: room_type, dtype: int64
Percentages: availability_90 room_type
0 0 0.689294
1 0.290535
2 0.020171
1 0 0.775819
1 0.207809
2 0.016373
Name: room_type, dtype: float64
In [ ]:
booked_host_listings = df_prob_2.groupby('availability_90')['host_listings_count'].agg(['min', 'mean', 'median', 'max']).reset_index()
booked_host_listings['availability_90'] = booked_host_listings['availability_90'].astype('string')
booked_host_listings['availability_90'][0] = "Non-Booked"
booked_host_listings['availability_90'][1] = "Booked"
booked_host_listings
Out[ ]:
| availability_90 | min | mean | median | max | |
|---|---|---|---|---|---|
| 0 | Non-Booked | 1.0 | 17.251746 | 1.0 | 339.0 |
| 1 | Booked | 1.0 | 12.255668 | 1.0 | 339.0 |
In [ ]:
df_prob_2_temp = df_prob_2.copy()
df_prob_2_temp['instant_bookable'] = df_prob_2_temp['instant_bookable'].astype('int')
booked_instant = df_prob_2_temp.groupby('availability_90')['instant_bookable'].agg(['min', 'mean', 'median', 'max']).reset_index()
booked_instant['availability_90'] = booked_host_listings['availability_90'].astype('string')
booked_instant['availability_90'][0] = "Non-Booked"
booked_instant['availability_90'][1] = "Booked"
booked_instant
Out[ ]:
| availability_90 | min | mean | median | max | |
|---|---|---|---|---|---|
| 0 | Non-Booked | 0 | 0.117145 | 0.0 | 1 |
| 1 | Booked | 0 | 0.103275 | 0.0 | 1 |
In [ ]:
df_prob_2_temp = df_prob_2.copy()
df_prob_2_temp['host_identity_verified'] = df_prob_2_temp['host_identity_verified'].astype('int')
booked_host_ver = df_prob_2_temp.groupby('availability_90')['host_identity_verified'].agg(['min', 'mean', 'median', 'max']).reset_index()
booked_host_ver['availability_90'] = booked_host_ver['availability_90'].astype('string')
booked_host_ver['availability_90'][0] = "Non-Booked"
booked_host_ver['availability_90'][1] = "Booked"
booked_host_ver
Out[ ]:
| availability_90 | min | mean | median | max | |
|---|---|---|---|---|---|
| 0 | Non-Booked | 0 | 0.803724 | 1.0 | 1 |
| 1 | Booked | 0 | 0.751889 | 1.0 | 1 |
In [ ]:
df_prob_2_temp = df_prob_2.copy()
df_prob_2_temp['cancellation_policy'] = df_prob_2_temp['cancellation_policy'].astype('int')
print("Value Counts:", df_prob_2_temp.groupby('availability_90')['cancellation_policy'].value_counts())
print("Percentages:", df_prob_2_temp.groupby('availability_90')['cancellation_policy'].value_counts() / df_prob_2_temp.groupby('availability_90')['cancellation_policy'].count())
booked_cancel_policy = df_prob_2_temp.groupby('availability_90')['cancellation_policy'].agg(['min', 'mean', 'median', 'max']).reset_index()
booked_cancel_policy['availability_90'] = booked_host_ver['availability_90'].astype('string')
booked_cancel_policy['availability_90'][0] = "Non-Booked"
booked_cancel_policy['availability_90'][1] = "Booked"
print(booked_cancel_policy)
Value Counts: availability_90 cancellation_policy
0 3 1289
1 673
0 608
4 7
2 1
1 3 361
0 222
1 210
4 1
Name: cancellation_policy, dtype: int64
Percentages: availability_90 cancellation_policy
0 3 0.500000
1 0.261055
0 0.235842
4 0.002715
2 0.000388
1 3 0.454660
0 0.279597
1 0.264484
4 0.001259
Name: cancellation_policy, dtype: float64
availability_90 min mean median max
0 Non-Booked 0 1.772692 3.0 4
1 Booked 0 1.633501 1.0 4
flexible = 0, moderate = 1, no_refunds = 2, strict = 3, super_strict_30 = 4
In [ ]:
df_temp = df.copy()
df_temp['review_scores_rating'] = df_temp['review_scores_rating'].apply(lambda x: categorize_review_scores(x))
reviews_sec_deposit = df_temp.groupby('review_scores_rating')['security_deposit'].agg(['min', 'mean', 'median', 'max', 'count']).reset_index()
reviews_sec_deposit
Out[ ]:
| review_scores_rating | min | mean | median | max | count | |
|---|---|---|---|---|---|---|
| 0 | 2 | 0.0 | 0.000000 | 0.0 | 0.0 | 5 |
| 1 | 4 | 0.0 | 0.000000 | 0.0 | 0.0 | 1 |
| 2 | 5 | 0.0 | 83.333333 | 0.0 | 250.0 | 3 |
| 3 | 6 | 0.0 | 134.000000 | 0.0 | 1000.0 | 25 |
| 4 | 7 | 0.0 | 233.333333 | 0.0 | 3000.0 | 18 |
| 5 | 8 | 0.0 | 143.138365 | 0.0 | 2400.0 | 318 |
| 6 | 9 | 0.0 | 198.252498 | 100.0 | 5000.0 | 3002 |
In [ ]:
print("Value Counts 8:", df_temp.groupby('review_scores_rating')['security_deposit'].value_counts()[8])
print("Percentages 8:", df_temp.groupby('review_scores_rating')['security_deposit'].value_counts()[8] / df_temp.groupby('review_scores_rating')['security_deposit'].count()[8])
print("Value Counts 9:", df_temp.groupby('review_scores_rating')['security_deposit'].value_counts()[9])
print("Percentages 9:", df_temp.groupby('review_scores_rating')['security_deposit'].value_counts()[9] / df_temp.groupby('review_scores_rating')['security_deposit'].count()[9])
Value Counts 8: security_deposit 0.0 186 100.0 30 500.0 21 250.0 18 300.0 15 200.0 12 150.0 7 95.0 5 400.0 5 1000.0 5 175.0 2 700.0 2 125.0 1 195.0 1 234.0 1 240.0 1 800.0 1 1200.0 1 1250.0 1 1899.0 1 2000.0 1 2400.0 1 Name: security_deposit, dtype: int64 Percentages 8: security_deposit 0.0 0.584906 100.0 0.094340 500.0 0.066038 250.0 0.056604 300.0 0.047170 200.0 0.037736 150.0 0.022013 95.0 0.015723 400.0 0.015723 1000.0 0.015723 175.0 0.006289 700.0 0.006289 125.0 0.003145 195.0 0.003145 234.0 0.003145 240.0 0.003145 800.0 0.003145 1200.0 0.003145 1250.0 0.003145 1899.0 0.003145 2000.0 0.003145 2400.0 0.003145 Name: security_deposit, dtype: float64 Value Counts 9: security_deposit 0.0 1412 500.0 281 200.0 245 100.0 235 250.0 160 300.0 158 150.0 102 1000.0 94 95.0 62 400.0 48 350.0 40 750.0 17 600.0 16 1500.0 15 125.0 13 450.0 10 2000.0 10 800.0 9 700.0 7 2500.0 6 295.0 5 1200.0 5 99.0 4 900.0 4 3000.0 4 175.0 3 199.0 3 120.0 2 129.0 2 180.0 2 195.0 2 225.0 2 275.0 2 375.0 2 550.0 2 850.0 2 149.0 1 160.0 1 185.0 1 280.0 1 290.0 1 315.0 1 650.0 1 675.0 1 945.0 1 999.0 1 1100.0 1 1250.0 1 1400.0 1 1600.0 1 2900.0 1 5000.0 1 Name: security_deposit, dtype: int64 Percentages 9: security_deposit 0.0 0.470353 500.0 0.093604 200.0 0.081612 100.0 0.078281 250.0 0.053298 300.0 0.052632 150.0 0.033977 1000.0 0.031312 95.0 0.020653 400.0 0.015989 350.0 0.013324 750.0 0.005663 600.0 0.005330 1500.0 0.004997 125.0 0.004330 450.0 0.003331 2000.0 0.003331 800.0 0.002998 700.0 0.002332 2500.0 0.001999 295.0 0.001666 1200.0 0.001666 99.0 0.001332 900.0 0.001332 3000.0 0.001332 175.0 0.000999 199.0 0.000999 120.0 0.000666 129.0 0.000666 180.0 0.000666 195.0 0.000666 225.0 0.000666 275.0 0.000666 375.0 0.000666 550.0 0.000666 850.0 0.000666 149.0 0.000333 160.0 0.000333 185.0 0.000333 280.0 0.000333 290.0 0.000333 315.0 0.000333 650.0 0.000333 675.0 0.000333 945.0 0.000333 999.0 0.000333 1100.0 0.000333 1250.0 0.000333 1400.0 0.000333 1600.0 0.000333 2900.0 0.000333 5000.0 0.000333 Name: security_deposit, dtype: float64
In [ ]:
df_temp = df.copy()
df_temp['review_scores_rating'] = df_temp['review_scores_rating'].apply(lambda x: categorize_review_scores(x))
reviews_sec_deposit = df_temp.groupby('review_scores_rating')['weekly_price'].agg(['min', 'mean', 'median', 'max', 'count']).reset_index()
reviews_sec_deposit
Out[ ]:
| review_scores_rating | min | mean | median | max | count | |
|---|---|---|---|---|---|---|
| 0 | 2 | 105.0 | 2312.800000 | 2450.0 | 5313.0 | 5 |
| 1 | 4 | 3999.0 | 3999.000000 | 3999.0 | 3999.0 | 1 |
| 2 | 5 | 220.0 | 936.666667 | 525.0 | 2065.0 | 3 |
| 3 | 6 | 175.0 | 1446.920000 | 910.0 | 5593.0 | 25 |
| 4 | 7 | 420.0 | 1574.500000 | 775.0 | 7000.0 | 18 |
| 5 | 8 | 105.0 | 1289.025157 | 800.0 | 8750.0 | 318 |
| 6 | 9 | 95.0 | 1401.106596 | 900.0 | 17843.0 | 3002 |
In [ ]:
print("Mode 8:", df_temp.groupby('review_scores_rating')['weekly_price'].value_counts()[8])
print("Percentages 8:", max(df_temp.groupby('review_scores_rating')['weekly_price'].value_counts()[8] / df_temp.groupby('review_scores_rating')['weekly_price'].count()[8]))
print("Mode 9:", df_temp.groupby('review_scores_rating')['weekly_price'].value_counts()[9])
print("Percentages 9:", df_temp.groupby('review_scores_rating')['weekly_price'].value_counts()[9] / df_temp.groupby('review_scores_rating')['weekly_price'].count()[9])
Mode 8: weekly_price
1050.0 12
350.0 9
700.0 9
1750.0 9
250.0 7
..
6846.0 1
6999.0 1
7500.0 1
8400.0 1
8750.0 1
Name: weekly_price, Length: 153, dtype: int64
Percentages 8: 0.03773584905660377
Mode 9: weekly_price
1050.0 82
700.0 81
1400.0 71
500.0 63
875.0 57
..
12565.0 1
12950.0 1
15393.0 1
17500.0 1
17843.0 1
Name: weekly_price, Length: 588, dtype: int64
Percentages 9: weekly_price
1050.0 0.027315
700.0 0.026982
1400.0 0.023651
500.0 0.020986
875.0 0.018987
...
12565.0 0.000333
12950.0 0.000333
15393.0 0.000333
17500.0 0.000333
17843.0 0.000333
Name: weekly_price, Length: 588, dtype: float64
In [ ]:
df_temp = df.copy()
df_temp['review_scores_rating'] = df_temp['review_scores_rating'].apply(lambda x: categorize_review_scores(x))
df_temp['host_is_superhost'] = df_temp['host_is_superhost'].astype('int')
reviews_sec_deposit = df_temp.groupby('review_scores_rating')['host_is_superhost'].agg(['min', 'mean', 'median', 'max', 'count']).reset_index()
reviews_sec_deposit
Out[ ]:
| review_scores_rating | min | mean | median | max | count | |
|---|---|---|---|---|---|---|
| 0 | 2 | 0 | 0.000000 | 0.0 | 0 | 5 |
| 1 | 4 | 1 | 1.000000 | 1.0 | 1 | 1 |
| 2 | 5 | 0 | 0.000000 | 0.0 | 0 | 3 |
| 3 | 6 | 0 | 0.080000 | 0.0 | 1 | 25 |
| 4 | 7 | 0 | 0.055556 | 0.0 | 1 | 18 |
| 5 | 8 | 0 | 0.012579 | 0.0 | 1 | 318 |
| 6 | 9 | 0 | 0.247835 | 0.0 | 1 | 3002 |